diff --git a/.clang-format b/.clang-format
index 4311596850..3e49ddce6e 100644
--- a/.clang-format
+++ b/.clang-format
@@ -8,6 +8,7 @@ AlignEscapedNewlines:         true
 AlignOperands:                Align
 AllowShortIfStatementsOnASingleLine: AllIfsAndElse
 ColumnLimit:                  150
+PenaltyBreakOpenParenthesis:  100
 ReflowComments:               false
 CommentPragmas:               'TESTARGS'
 DerivePointerAlignment:       false
diff --git a/.clang-tidy b/.clang-tidy
index ab45c266bf..04cd208737 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -1,3 +1,3 @@
-Checks: "clang-diagnostic-*,clang-analyzer-*,readability-inconsistent-declaration-parameter-name,bugprone-too-small-loop-variable"
+Checks: "clang-diagnostic-*,clang-analyzer-*,readability-inconsistent-declaration-parameter-name,bugprone-too-small-loop-variable,-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling"
 HeaderFilterRegex: .*
 WarningsAsErrors: "clang-diagnostic-*,clang-analyzer-*,readability-inconsistent-declaration-parameter-name"
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000000..9c6f967319
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,12 @@
+Purpose:
+
+Describe the purpose of the PR here.
+
+Closes: #ISSUE_NUMBER
+
+LLM/GenAI Disclosure:
+
+Describe any LLM and GenAI usage here.
+
+By submitting this PR, the author certifies to its contents as described by the [Developer's Certificate of Origin](https://developercertificate.org/).
+Please follow the [Contributing Guidelines](https://github.com/CEED/libCEED/blob/main/CONTRIBUTING.md) for all PRs.
diff --git a/.github/workflows/c-fortran-test-hardware.yml b/.github/workflows/c-fortan-test-ppc64le.yml
similarity index 72%
rename from .github/workflows/c-fortran-test-hardware.yml
rename to .github/workflows/c-fortan-test-ppc64le.yml
index 7dd7626ebf..f710c9ba12 100644
--- a/.github/workflows/c-fortran-test-hardware.yml
+++ b/.github/workflows/c-fortan-test-ppc64le.yml
@@ -1,4 +1,4 @@
-name: ARM and IBM Power
+name: IBM Power
 
 on:
   push:
@@ -10,9 +10,9 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
-        compiler: [gcc-13]
-        arch: [aarch64, ppc64le]
+        os: [ubuntu-24.04]
+        compiler: [gcc]
+        arch: [ppc64le]
         distro: [ubuntu22.04]
 
     runs-on: ${{ matrix.os }}
@@ -21,10 +21,10 @@ jobs:
     - name: Environment setup
       uses: actions/checkout@v4
     - name: Hardware setup and test libCEED
-      uses: uraimo/run-on-arch-action@v2
+      uses: uraimo/run-on-arch-action@v3
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-13
+        FC: gfortran
       id: runcmd
       with:
         arch: ${{ matrix.arch }}
@@ -36,5 +36,5 @@ jobs:
           apt-get install -y python3
           uname -a
           make info
-          make -j2
-          PROVE_OPTS=-v make prove -j2
+          make -j
+          make prove -j search="t5 ex"
diff --git a/.github/workflows/c-fortran-test-arm64.yml b/.github/workflows/c-fortran-test-arm64.yml
new file mode 100644
index 0000000000..6927f37b68
--- /dev/null
+++ b/.github/workflows/c-fortran-test-arm64.yml
@@ -0,0 +1,28 @@
+name: ARM
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-24.04-arm]
+        compiler: [gcc, clang]
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+    - name: Environment setup
+      uses: actions/checkout@v4
+    - name: Build and test libCEED
+      env:
+        CC: ${{ matrix.compiler }}
+        FC: gfortran
+      run: |
+        make info
+        make -j
+        make prove -j
diff --git a/.github/workflows/c-fortran-test-icc.yml b/.github/workflows/c-fortran-test-icc.yml
index fc5f3407cd..4e854195b1 100644
--- a/.github/workflows/c-fortran-test-icc.yml
+++ b/.github/workflows/c-fortran-test-icc.yml
@@ -14,7 +14,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
 
     runs-on: ${{ matrix.os }}
 
@@ -32,6 +32,6 @@ jobs:
           export CC=icx CXX=icx FC=ifx
           export OPENMP=1
           make info
-          make -j2
-          PROVE_OPTS=-v make prove -j2
+          make -j
+          make prove -j
 
diff --git a/.github/workflows/c-fortran-test-linux-osx.yml b/.github/workflows/c-fortran-test-linux-osx.yml
index 806cbcc16d..52df23c8d1 100644
--- a/.github/workflows/c-fortran-test-linux-osx.yml
+++ b/.github/workflows/c-fortran-test-linux-osx.yml
@@ -10,19 +10,44 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04, macos-13]
-        compiler: [gcc-13, clang]
+        os: [ubuntu-24.04, macos-15]
+        compiler: [gcc, clang]
+        include:
+          - os: macos-15
+            compiler: apple-clang
 
     runs-on: ${{ matrix.os }}
 
     steps:
     - name: Environment setup
       uses: actions/checkout@v4
+    - name: Set compiler
+      run: |
+        case "${{ matrix.compiler }}" in
+          gcc)
+            if [[ "${{ matrix.os }}" == macos-* ]]; then
+              echo "CC=gcc-15" >> $GITHUB_ENV
+            else
+              echo "CC=gcc" >> $GITHUB_ENV
+            fi
+            ;;
+          clang)
+            if [[ "${{ matrix.os }}" == macos-* ]]; then
+              echo "CC=$(brew --prefix llvm@18)/bin/clang" >> $GITHUB_ENV
+            else
+              echo "CC=clang" >> $GITHUB_ENV
+            fi
+            ;;
+          apple-clang)
+            echo "CC=clang" >> $GITHUB_ENV
+            ;;
+        esac
+    - name: Show compiler version
+      run: $CC --version | head -1
     - name: Build and test libCEED
       env:
-        CC: ${{ matrix.compiler }}
-        FC: gfortran-13
+        FC: gfortran-14
       run: |
         make info
-        make -j2
-        PROVE_OPTS=-v make prove -j2
+        make -j
+        make prove -j2
diff --git a/.github/workflows/c-fortran-test-style.yml b/.github/workflows/c-fortran-test-style.yml
index 4f2fcb4054..ff55101bde 100644
--- a/.github/workflows/c-fortran-test-style.yml
+++ b/.github/workflows/c-fortran-test-style.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
         compiler: [clang]
 
     runs-on: ${{ matrix.os }}
@@ -21,12 +21,12 @@ jobs:
     - name: Install clang-format
       run: |
           wget -O- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
-          sudo add-apt-repository 'deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-17 main'
-          sudo apt update && sudo apt install clang-format-17
+          sudo add-apt-repository 'deb http://apt.llvm.org/noble/ llvm-toolchain-noble-19 main'
+          sudo apt update && sudo apt install clang-format-19
     - name: C style
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-11
+        FC: gfortran
       run: |
         make info
-        make format-c -j2 CLANG_FORMAT=clang-format-17 && git diff --exit-code
+        make format-c -j CLANG_FORMAT=clang-format-19 && git diff --exit-code
diff --git a/.github/workflows/julia-documentation.yml b/.github/workflows/julia-documentation.yml
index d7a432426f..b90bb1bb1e 100644
--- a/.github/workflows/julia-documentation.yml
+++ b/.github/workflows/julia-documentation.yml
@@ -9,7 +9,7 @@ on:
 
 jobs:
   build:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     steps:
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@latest
diff --git a/.github/workflows/julia-test-with-style.yml b/.github/workflows/julia-test-with-style.yml
index b74434ff49..a292c9550b 100644
--- a/.github/workflows/julia-test-with-style.yml
+++ b/.github/workflows/julia-test-with-style.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
         julia-version: ['1']
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/python-test-with-style.yml b/.github/workflows/python-test-with-style.yml
index a8fc0af33c..4c2764b244 100644
--- a/.github/workflows/python-test-with-style.yml
+++ b/.github/workflows/python-test-with-style.yml
@@ -10,8 +10,8 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
-        compiler: [gcc-13]
+        os: [ubuntu-24.04]
+        compiler: [gcc]
         python-version: ['3.x']
 
     runs-on: ${{ matrix.os }}
@@ -30,7 +30,7 @@ jobs:
     - name: Python test
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-13
+        FC: gfortran
       run: |
         make info
         make -j2
@@ -38,16 +38,18 @@ jobs:
         pip install .
         cd python/tests
         PYTHON=python3 make test TEST_OPTS="--ceed /cpu/self/ref/serial -vv"
+        cd ../../examples/python
+        PYTHON=python3 make test TEST_OPTS="--ceed /cpu/self/ref/serial -vv"
         cd ../..
     - name: Python style
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-13
+        FC: gfortran
       run: |
         make format-py && git diff --exit-code
     - name: Python version
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-13
+        FC: gfortran
       run: |
         make vermin
diff --git a/.github/workflows/release-notes.yml b/.github/workflows/release-notes.yml
index 8d90b2490d..a4fa213618 100644
--- a/.github/workflows/release-notes.yml
+++ b/.github/workflows/release-notes.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
 
     runs-on: ${{ matrix.os }}
 
diff --git a/.github/workflows/rust-documentation.yml b/.github/workflows/rust-documentation.yml
index b0ca00c440..4d6410548a 100644
--- a/.github/workflows/rust-documentation.yml
+++ b/.github/workflows/rust-documentation.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
         compiler: [clang]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/rust-test-with-style.yml b/.github/workflows/rust-test-with-style.yml
index 0626ecb989..63ce2d7b3c 100644
--- a/.github/workflows/rust-test-with-style.yml
+++ b/.github/workflows/rust-test-with-style.yml
@@ -10,7 +10,7 @@ jobs:
   test:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04, macos-15]
         compiler: [clang]
 
     runs-on: ${{ matrix.os }}
@@ -31,8 +31,8 @@ jobs:
     - name: Rust test with coverage
       env:
         CC: ${{ matrix.compiler }}
-        FC: gfortran-11
-      run: cargo llvm-cov test --doctests --lcov --output-path lcov.info
+        FC: gfortran
+      run: CARGO_CEED_OPT_FLAGS="-g -O0 -fno-inline" cargo llvm-cov test --doctests --lcov --output-path lcov.info
     - name: Codecov upload
       uses: codecov/codecov-action@v4
       with:
@@ -42,7 +42,7 @@ jobs:
   style:
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
         compiler: [clang]
 
     runs-on: ${{ matrix.os }}
diff --git a/.gitignore b/.gitignore
index 7e7115b20e..0cb9d41a69 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ lib/*
 # General
 *.o
 *.so
+*.so.*
 *.d
 *.DIR
 ceed.pc
@@ -50,6 +51,12 @@ doc/sphinx/build/
 # Example docs automatically copied from source tree
 doc/sphinx/source/examples/
 
+# Clang GPU temp files
+temp/*
+
+# Nek5K
+SESSION.NAME
+
 # Output files, videos, and compressed archives should not be added accidentally
 *.avi
 *.bin
@@ -91,3 +98,10 @@ libCEED.includes
 *.aux
 *.fdb_latexmk
 *.fls
+
+# profiling files
+*.txt
+*.proto
+*.csv
+
+.venv
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 58f7e93f59..d3188148a5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -5,6 +5,10 @@ stages:
   - test:cpu-and-tidy
   - test:gpu-and-float
 
+workflow:
+  auto_cancel:
+    on_job_failure: all
+
 
 # ----------------------------------------------------------------------------------------
 # Memcheck backends + ASAN
@@ -15,36 +19,37 @@ noether-asan:
     - cpu
   interruptible: true
   before_script:
-# Environment
-#    Note: COVERAGE=0 is needed when using ASAN
+    # Environment
+    #    Note: COVERAGE=0 is needed when using ASAN
     - export COVERAGE=0 CC=gcc CXX=g++ FC=gfortran
     - export NPROC_POOL=8
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
     - echo "-------------- CC ------------------" && $CC --version
     - echo "-------------- CXX -----------------" && $CXX --version
     - echo "-------------- FC ------------------" && $FC --version
-# ASAN
+    # ASAN
     - echo "-------------- ASAN ----------------"
     - export ASAN=1 AFLAGS="-fsanitize=address -fsanitize=leak"
     - echo $AFLAGS
   script:
     - rm -f .SUCCESS
-# libCEED
-    - make configure OPT='-O -march=native -ffp-contract=fast'
+    # libCEED
+    - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast'
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU
+    - make clean
     - make -j$NPROC_CPU
-# -- libCEED only tests
+    # -- libCEED only tests
     - echo "-------------- core tests ----------"
     - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json
-#    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="memcheck" junit realsearch=%
-# Clang-tidy
+    # Clang-tidy
     - echo "-------------- clang-tidy ----------" && clang-tidy --version
     - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
-# Report status
+    # Report status
     - touch .SUCCESS
   artifacts:
     paths:
@@ -63,7 +68,7 @@ noether-cpu:
     - cpu
   interruptible: true
   before_script:
-# Environment
+    # Environment
     - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran
     - export NPROC_POOL=8
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
@@ -71,55 +76,52 @@ noether-cpu:
     - echo "-------------- CXX -----------------" && $CXX --version
     - echo "-------------- FC ------------------" && $FC --version
     - echo "-------------- GCOV ----------------" && gcov --version
-# Libraries for backends
-# -- LIBXSMM 2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8
-    - cd .. && export XSMM_HASH=2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
+    # Libraries for backends
+    # -- LIBXSMM 7 April 2024
+    - cd .. && export XSMM_HASH=94ee71576870152feb62f3f0cf6b061d036dcdb5 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
     - echo "-------------- LIBXSMM -------------" && basename $XSMM_DIR
-# -- OCCA v1.6.0
-    - cd .. && export OCCA_VERSION=occa-1.6.0 && { [[ -d $OCCA_VERSION ]] || { git clone --depth 1 --branch v1.6.0 https://github.com/libocca/occa.git $OCCA_VERSION && cd $OCCA_VERSION && export ENABLE_OPENCL="OFF" ENABLE_DPCPP="OFF" ENABLE_HIP="OFF" ENABLE_CUDA="OFF" && ./configure-cmake.sh && cmake --build build --parallel $NPROC_CPU && cmake --install build && cd ..; }; } && export OCCA_DIR=$PWD/$OCCA_VERSION/install && cd libCEED
-    - echo "-------------- OCCA ----------------" && git -C $OCCA_DIR describe --tags && LD_LIBRARY_PATH=$OCCA_DIR/lib $OCCA_DIR/bin/occa info
   script:
     - rm -f .SUCCESS
-# libCEED
-    - make configure OPT='-O -march=native -ffp-contract=fast'
+    # libCEED
+    - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast'
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU
-    - OCCA_DIR= PEDANTIC=1 make -j$NPROC_CPU
+    - make clean
+    - PEDANTIC=1 make -j$NPROC_CPU
     - make -j$NPROC_CPU
-# -- libCEED only tests
+    # -- libCEED only tests
     - echo "-------------- core tests ----------"
     - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json
-#    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit realsearch=%
-# Libraries for examples
-# -- PETSc with HIP (minimal)
-    - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip-int64 && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
-    - source /home/jawr8143/SmartSimTestingSoftware/bin/activate && export SMARTREDIS_DIR=/home/jawr8143/SmartSimTestingSoftware/smartredis/install
+    # Libraries for examples
+    # -- PETSc (minimal)
+    - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cpu-int64 && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
-    - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids solids"
-# -- MFEM v4.6
-    - cd .. && export MFEM_VERSION=mfem-4.6 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.6 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
+    - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search="petsc fluids-navierstokes solids"
+    # -- MFEM v4.7
+    - cd .. && export MFEM_VERSION=mfem-4.7 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.7 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
     - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=mfem
-# -- Nek5000 v19.0
+    # -- Nek5000 v19.0
     - export COVERAGE=0
     - cd .. && export NEK5K_VERSION=Nek5000-19.0 && { [[ -d $NEK5K_VERSION ]] || { git clone --depth 1 --branch v19.0 https://github.com/Nek5000/Nek5000.git $NEK5K_VERSION && cd $NEK5K_VERSION/tools && ./maketools genbox genmap reatore2 && cd ../..; }; } && export NEK5K_DIR=$PWD/$NEK5K_VERSION && export PATH=$NEK5K_DIR/bin:$PATH MPI=0 && cd libCEED
     - echo "-------------- Nek5000 -------------" && git -C $NEK5K_DIR describe --tags
     - export NPROC_POOL=1
     - make -k -j$NPROC_CPU BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=nek NEK5K_DIR=$NEK5K_DIR
-# -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11
-    - OCCA_DIR= BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
+    # -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11
+    - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ')
     - export DEAL_II_ROOT_DIR=/projects/dealii DEAL_II_DIR=/projects/dealii/install
     - echo "-------------- deal.II -------------" && git -C $DEAL_II_ROOT_DIR -c safe.directory=$DEAL_II_ROOT_DIR describe --always
     - make -k -j$NPROC_CPU BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="cpu" junit search=dealii DEAL_II_DIR=$DEAL_II_DIR
-# Report status
+    # Report status
     - touch .SUCCESS
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -137,29 +139,88 @@ noether-cpu:
 # ----------------------------------------------------------------------------------------
 # Check SYCL backends build
 # ----------------------------------------------------------------------------------------
-noether-sycl:
+
+# SYCL tests currently disabled
+
+#noether-sycl:
+#  stage: test:gpu-and-float
+#  tags:
+#    - sycl
+#  interruptible: true
+#  before_script:
+#    # Environment
+#    - . /opt/intel/oneapi/setvars.sh
+#    - export COVERAGE=1 CC=icx CXX=icpx
+#    - export NPROC_POOL=8
+#    - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
+#    - echo "-------------- CC ------------------" && $CC --version
+#    - echo "-------------- CXX -----------------" && $CXX --version
+#  script:
+#    - rm -f .SUCCESS
+#    # libCEED
+#    - make configure SYCL_DIR=/opt/intel/oneapi/compiler/latest OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast'
+#    - BACKENDS_SYCL=$(make info-backends-all | grep -o '/sycl[^ ]*' | tr '\n' ' ')
+#    - echo "-------------- libCEED -------------" && make info
+#    - echo "-------------- BACKENDS_SYCL -------" && echo $BACKENDS_SYCL
+#    - make clean
+#    - make -j$NPROC_CPU
+#    # Report status
+#    - touch .SUCCESS
+
+
+# ----------------------------------------------------------------------------------------
+# Rust + CUDA
+# ----------------------------------------------------------------------------------------
+noether-rust-qfunctions:
   stage: test:gpu-and-float
   tags:
-    - sycl
+    - cuda
   interruptible: true
   before_script:
-# Environment
-    - . /opt/intel/oneapi/setvars.sh
-    - export COVERAGE=1 CC=icx CXX=icpx
-    - export NPROC_POOL=8
+    # Environment
+    - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran NVCC=nvcc GPU_CLANG=1
+    - export NPROC_POOL=1
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
     - echo "-------------- CC ------------------" && $CC --version
     - echo "-------------- CXX -----------------" && $CXX --version
+    - echo "-------------- FC ------------------" && $FC --version
+    - echo "-------------- NVCC ----------------" && $NVCC --version
+    - echo "-------------- Rustc ---------------" && rustc --version
+    - echo "-------------- Clang++ -------------" && clang++ --version
+    - echo "-------------- GCOV ----------------" && gcov --version
   script:
     - rm -f .SUCCESS
-# libCEED
-    - make configure SYCL_DIR=/opt/intel/oneapi/compiler/latest OPT='-O -march=native -ffp-contract=fast'
-    - BACKENDS_SYCL=$(make info-backends-all | grep -o '/sycl[^ ]*' | tr '\n' ' ')
+    # Rustup
+    - rustup update nightly
+    - rustup component add rust-src --toolchain nightly
+    - rustup component add llvm-tools --toolchain nightly
+    # libCEED
+    - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
     - echo "-------------- libCEED -------------" && make info
-    - echo "-------------- BACKENDS_SYCL -------" && echo $BACKENDS_SYCL
-    - make -j$NPROC_CPU
-# Report status
+    - make clean
+    - make -k -j$NPROC_CPU -l$NPROC_CPU
+    # -- libCEED only tests
+    - echo "-------------- Rust QFunction tests -----"
+    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+    - export PETSC_DIR= PETSC_ARCH=
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="rust-qfunction" junit search=rustqfunction
+    # Report status
     - touch .SUCCESS
+  after_script:
+    - |
+      if [ -f .SUCCESS ]; then
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F tests;
+        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F examples;
+      fi
+  artifacts:
+    paths:
+      - build/*.junit
+    reports:
+      junit: build/*.junit
 
 
 # ----------------------------------------------------------------------------------------
@@ -171,7 +232,7 @@ noether-cuda:
     - cuda
   interruptible: true
   before_script:
-# Environment
+    # Environment
     - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran NVCC=nvcc
     - export NPROC_POOL=4
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
@@ -180,34 +241,56 @@ noether-cuda:
     - echo "-------------- FC ------------------" && $FC --version
     - echo "-------------- NVCC ----------------" && $NVCC --version
     - echo "-------------- GCOV ----------------" && gcov --version
+    # ASAN
+    - echo "-------------- ASAN ----------------"
+    - export ASAN=1 AFLAGS="-fsanitize=address -fsanitize=leak" ASAN_OPTIONS=protect_shadow_gap=0
+    - echo $AFLAGS
   script:
     - rm -f .SUCCESS
-# libCEED
-    - make configure OPT='-O -march=native -ffp-contract=fast' CUDA_DIR=/usr
+    # libCEED
+    - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
     - echo "-------------- libCEED -------------" && make info
     - BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
+    - make clean
     - PEDANTIC=1 make -k -j$NPROC_CPU -l$NPROC_CPU
-# -- libCEED only tests
+    # -- libCEED only tests
     - echo "-------------- core tests ----------"
-#    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit realsearch=%
-# Libraries for examples
-# -- PETSc with CUDA (minimal)
+    # Rebuild without ASAN
+    - unset ASAN AFLAGS ASAN_OPTIONS
+    - make clean
+    - PEDANTIC=1 make -k -j$NPROC_CPU -l$NPROC_CPU
+    # Libraries for examples
+    # -- PETSc with CUDA (minimal)
     - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-cuda-O PETSC_OPTIONS='-use_gpu_aware_mpi 0' && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
-    - source /home/jawr8143/SmartSimTestingSoftware/bin/activate && export SMARTREDIS_DIR=/home/jawr8143/SmartSimTestingSoftware/smartredis/install
     - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) JUNIT_BATCH="cuda" junit BACKENDS="$BACKENDS_GPU" search="petsc fluids solids"
-# Clang-tidy
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search="petsc fluids solids"
+    # -- MFEM v4.7
+    - cd .. && export MFEM_VERSION=mfem-4.7 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.7 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
+    - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=mfem
+    # -- Nek5000 v19.0
+    - export COVERAGE=0
+    - cd .. && export NEK5K_VERSION=Nek5000-19.0 && { [[ -d $NEK5K_VERSION ]] || { git clone --depth 1 --branch v19.0 https://github.com/Nek5000/Nek5000.git $NEK5K_VERSION && cd $NEK5K_VERSION/tools && ./maketools genbox genmap reatore2 && cd ../..; }; } && export NEK5K_DIR=$PWD/$NEK5K_VERSION && export PATH=$NEK5K_DIR/bin:$PATH MPI=0 && cd libCEED
+    - echo "-------------- Nek5000 -------------" && git -C $NEK5K_DIR describe --tags
+    - export NPROC_POOL=1
+    - make -k -j$NPROC_GPU BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=nek NEK5K_DIR=$NEK5K_DIR
+    # -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11
+    - export DEAL_II_ROOT_DIR=/projects/dealii DEAL_II_DIR=/projects/dealii/install
+    - echo "-------------- deal.II -------------" && git -C $DEAL_II_ROOT_DIR -c safe.directory=$DEAL_II_ROOT_DIR describe --always
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="cuda" junit search=dealii DEAL_II_DIR=$DEAL_II_DIR
+    # Clang-tidy
     - echo "-------------- clang-tidy ----------" && clang-tidy --version
     - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
-# Report status
+    # Report status
     - touch .SUCCESS
   after_script:
     - |
       if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
@@ -224,13 +307,78 @@ noether-cuda:
 # ----------------------------------------------------------------------------------------
 # ROCm backends
 # ----------------------------------------------------------------------------------------
+
+# ROCm test execution currently disabled
+
+#noether-rocm:
+#  stage: test:gpu-and-float
+#  tags:
+#    - rocm
+#  interruptible: true
+#  before_script:
+#    # Environment
+#    - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran HIPCC=hipcc
+#    - export NPROC_POOL=4
+#    - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
+#    - echo "-------------- CC ------------------" && $CC --version
+#    - echo "-------------- CXX -----------------" && $CXX --version
+#    - echo "-------------- FC ------------------" && $FC --version
+#    - echo "-------------- HIPCC ---------------" && $HIPCC --version
+#    - echo "-------------- GCOV ----------------" && gcov --version
+#    # Libraries for backends
+#    # -- MAGMA from dev branch
+#    - echo "-------------- MAGMA ---------------"
+#    - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
+#  script:
+#    - rm -f .SUCCESS
+#    # libCEED
+#    - make configure ROCM_DIR=/opt/rocm-6.3.0 OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast'
+#    - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
+#    - echo "-------------- libCEED -------------" && make info
+#    - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
+#    - make clean
+#    - make -j$NPROC_CPU
+#    # -- libCEED only tests
+#    - echo "-------------- core tests ----------"
+#    - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json
+#    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+#    - export PETSC_DIR= PETSC_ARCH=
+#    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit realsearch=%
+#    # Libraries for examples
+#    # -- PETSc with HIP (minimal)
+#    - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
+#    - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
+#    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search="petsc fluids solids"
+#    # Clang-tidy
+#    - echo "-------------- clang-tidy ----------" && clang-tidy --version
+#    - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
+#    # Report status
+#    - touch .SUCCESS
+#  after_script:
+#    - |
+#      if [ -f .SUCCESS ]; then
+#        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
+#        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
+#        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
+#        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
+#        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F tests;
+#        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F examples;
+#      fi
+#  artifacts:
+#    paths:
+#      - build/*.junit
+#    reports:
+#      junit: build/*.junit
+#      performance: performance.json
+
+
 noether-rocm:
   stage: test:gpu-and-float
   tags:
     - rocm
   interruptible: true
   before_script:
-# Environment
+    # Environment
     - export COVERAGE=1 CC=gcc CXX=g++ FC=gfortran HIPCC=hipcc
     - export NPROC_POOL=4
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
@@ -239,117 +387,80 @@ noether-rocm:
     - echo "-------------- FC ------------------" && $FC --version
     - echo "-------------- HIPCC ---------------" && $HIPCC --version
     - echo "-------------- GCOV ----------------" && gcov --version
-# Libraries for backends
-# -- MAGMA from dev branch
+    # Libraries for backends
+    # -- MAGMA from dev branch
     - echo "-------------- MAGMA ---------------"
     - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
   script:
     - rm -f .SUCCESS
-# libCEED
-    - make configure ROCM_DIR=/opt/rocm-5.6.0 OPT='-O -march=native -ffp-contract=fast'
+    # libCEED
+    - make configure ROCM_DIR=/opt/rocm-6.3.0 OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast'
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
+    - make clean
     - make -j$NPROC_CPU
-# -- libCEED only tests
-    - echo "-------------- core tests ----------"
-    - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json
-#    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
-    - export PETSC_DIR= PETSC_ARCH=
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit realsearch=%
-# Libraries for examples
-# -- PETSc with HIP (minimal)
-    - export PETSC_DIR=/projects/petsc PETSC_ARCH=mpich-hip && git -C $PETSC_DIR -c safe.directory=$PETSC_DIR describe
-    - echo "-------------- PETSc ---------------" && make -C $PETSC_DIR info
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search="petsc fluids solids"
-# -- MFEM v4.6
-    - cd .. && export MFEM_VERSION=mfem-4.6 && { [[ -d $MFEM_VERSION ]] || { git clone --depth 1 --branch v4.6 https://github.com/mfem/mfem.git $MFEM_VERSION && make -C $MFEM_VERSION -j$(nproc) serial CXXFLAGS="-O -std=c++11"; }; } && export MFEM_DIR=$PWD/$MFEM_VERSION && cd libCEED
-    - echo "-------------- MFEM ----------------" && make -C $MFEM_DIR info
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search=mfem
-# -- Nek5000 v19.0
-    - export COVERAGE=0
-    - cd .. && export NEK5K_VERSION=Nek5000-19.0 && { [[ -d $NEK5K_VERSION ]] || { git clone --depth 1 --branch v19.0 https://github.com/Nek5000/Nek5000.git $NEK5K_VERSION && cd $NEK5K_VERSION/tools && ./maketools genbox genmap reatore2 && cd ../..; }; } && export NEK5K_DIR=$PWD/$NEK5K_VERSION && export PATH=$NEK5K_DIR/bin:$PATH MPI=0 && cd libCEED
-    - echo "-------------- Nek5000 -------------" && git -C $NEK5K_DIR describe --tags
-    - export NPROC_POOL=1
-    - make -k -j$NPROC_GPU BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search=nek NEK5K_DIR=$NEK5K_DIR
-# -- deal.II 8bd5c262f13e15793aa206b6eed8774a9b25ce11
-    - export DEAL_II_ROOT_DIR=/projects/dealii DEAL_II_DIR=/projects/dealii/install
-    - echo "-------------- deal.II -------------" && git -C $DEAL_II_ROOT_DIR -c safe.directory=$DEAL_II_ROOT_DIR describe --always
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="hip" junit search=dealii DEAL_II_DIR=$DEAL_II_DIR
-# Clang-tidy
+    # Clang-tidy
     - echo "-------------- clang-tidy ----------" && clang-tidy --version
+    - make clean
     - TIDY_OPTS="-fix-errors" make -j$NPROC_CPU tidy && git diff --color=always --exit-code
-# Report status
+    # Report status
     - touch .SUCCESS
-  after_script:
-    - |
-      if [ -f .SUCCESS ]; then
-        lcov --directory . --capture --output-file coverage.info;
-        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
-        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
-        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
-        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F tests;
-        bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F examples;
-      fi
-  artifacts:
-    paths:
-      - build/*.junit
-    reports:
-      junit: build/*.junit
-      performance: performance.json
 
 
 # ----------------------------------------------------------------------------------------
-# CPU + ROCm backends with CeedScalar == float (32 bit)
+# CPU + CUDA backends with CeedScalar == float (32 bit)
 # ----------------------------------------------------------------------------------------
 noether-float:
   stage: test:gpu-and-float
   tags:
     - cpu
-    - rocm
+    - cuda
   interruptible: true
   before_script:
-# Environment
-    - export COVERAGE=1 CC=gcc CXX=g++ FC= HIPCC=hipcc
+    # Environment
+    - export COVERAGE=1 CC=gcc CXX=g++ FC= NVCC=nvcc
     - export NPROC_POOL=8
     - echo "-------------- nproc ---------------" && NPROC_CPU=$(nproc) && NPROC_GPU=$(($(nproc)<8?$(nproc):8)) && echo "NPROC_CPU" $NPROC_CPU && echo "NPROC_GPU" $NPROC_GPU
     - echo "-------------- CC ------------------" && $CC --version
     - echo "-------------- CXX -----------------" && $CXX --version
-    - echo "-------------- HIPCC ---------------" && $HIPCC --version
+    - echo "-------------- NVCC ----------------" && $NVCC --version
     - echo "-------------- GCOV ----------------" && gcov --version
-# Libraries for backends
+    # Libraries for backends
+# ROCm tests currently disabled
 # -- MAGMA from dev branch
-    - echo "-------------- MAGMA ---------------"
-    - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
-# -- LIBXSMM 2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8
-    - cd .. && export XSMM_HASH=2c145a109b5a8ad4e15f60ea42a86b9056bdc8b8 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
+#    - echo "-------------- MAGMA ---------------"
+#    - export MAGMA_DIR=/projects/hipMAGMA && git -C $MAGMA_DIR -c safe.directory=$MAGMA_DIR describe
+    # -- LIBXSMM 19 March 2025
+    - cd .. && export XSMM_HASH=ba9d6bc69c421c10f0597d582ea1ace6a6126308 && { [[ -d libxsmm-$XSMM_HASH ]] || { curl -L https://github.com/libxsmm/libxsmm/archive/$XSMM_HASH.tar.gz -o xsmm.tar.gz && tar zvxf xsmm.tar.gz && rm xsmm.tar.gz && make -C libxsmm-$XSMM_HASH -j$(nproc); }; } && export XSMM_DIR=$PWD/libxsmm-$XSMM_HASH && cd libCEED
     - echo "-------------- LIBXSMM -------------" && basename $XSMM_DIR
   script:
     - rm -f .SUCCESS
-# libCEED
-# Change to single precision
+    # libCEED
+    # Change to single precision
     - sed -i 's/ceed-f64/ceed-f32/1' include/ceed/types.h
-# Build libCEED
-    - make configure ROCM_DIR=/opt/rocm-5.6.0 OPT='-O -march=native -ffp-contract=fast'
+    # Build libCEED
+    - make configure OPT='-g -O0 -fno-inline -march=native -ffp-contract=fast' CUDA_DIR=/usr/local/cuda-12.9
     - BACKENDS_CPU=$(make info-backends-all | grep -o '/cpu[^ ]*' | tr '\n' ' ') && BACKENDS_GPU=$(make info-backends | grep -o '/gpu[^ ]*' | tr '\n' ' ')
     - echo "-------------- libCEED -------------" && make info
     - echo "-------------- BACKENDS_CPU --------" && echo $BACKENDS_CPU
     - echo "-------------- BACKENDS_GPU --------" && echo $BACKENDS_GPU
+    - make clean
     - make -j$NPROC_CPU
-# -- libCEED only tests
+    # -- libCEED only tests
     - echo "-------------- core tests ----------"
     - echo '[{"subject":"/","metrics":[{"name":"Transfer Size (KB)","value":"19.5","desiredSize":"smaller"},{"name":"Speed Index","value":0,"desiredSize":"smaller"},{"name":"Total Score","value":92,"desiredSize":"larger"},{"name":"Requests","value":4,"desiredSize":"smaller"}]}]' > performance.json
-#    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
+    #    Note: PETSC_DIR is set by default in GitLab runner env, unsetting to isolate core tests
     - export PETSC_DIR= PETSC_ARCH=
     - make -k -j$((NPROC_CPU / NPROC_POOL)) BACKENDS="$BACKENDS_CPU" JUNIT_BATCH="float-cpu" junit realsearch=%
     - export NPROC_POOL=4
-    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="float-hip" junit realsearch=%
-# Report status
+    - make -k -j$((NPROC_GPU / NPROC_POOL)) BACKENDS="$BACKENDS_GPU" JUNIT_BATCH="float-cuda" junit realsearch=%
+    # Report status
     - echo "SUCCESS" > .job_status
   after_script:
     - |
       if [ $(cat .job_status) == "SUCCESS" ]; then
-        lcov --directory . --capture --output-file coverage.info;
+        lcov --directory . --capture --output-file coverage.info --ignore-errors source,mismatch,unused --substitute 's#(t*-f.h)#test/(t*.-f.h)#g';
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F interface;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F gallery;
         bash <(curl -s https://codecov.io/bash) -f coverage.info -t ${CODECOV_ACCESS_TOKEN} -F backends;
diff --git a/.mailmap b/.mailmap
index 61c3cab2b6..787aaa2c49 100644
--- a/.mailmap
+++ b/.mailmap
@@ -6,7 +6,11 @@
 #
 # See 'git help shortlog' for details
 
+Adeleke Bankole <adeleke.bankole@colorado.edu>
+Adeleke Bankole <adeleke.bankole@colorado.edu>          <86932837+AdelekeBankole@users.noreply.github.com>
 Ahmad Abdelfattah <ahmad@icl.utk.edu>                   <36712794+abdelfattah83@users.noreply.github.com>
+Allen MacFarland <Allen.MacFarland@colorado.edu>        <79958059+SirAlienTheGreat@users.noreply.github.com>
+Alex Pedersen <ajpedersen20@gmail.com>                  <54287657+ajpedersen20@users.noreply.github.com>
 Arash Mehraban <arashm81@gmail.com>                     <ArashMehraban@users.noreply.github.com>
 David Medina <dmed256@gmail.com>
 James Wright <james@jameswright.xyz>                    <jameswright@jameswright.xyz>
@@ -16,13 +20,21 @@ Jeremy L. Thompson <jeremy@jeremylt.org>                <25011573+jeremylt@users
 Jeremy L. Thompson <jeremy@jeremylt.org>                <jeth8984@noether>
 Jeremy L. Thompson <jeremy@jeremylt.org>                <jeremy.thompson@colorado.edu>
 Jeremy L. Thompson <jeremy@jeremylt.org>                <thompson.jeremy.luke@gmail.com>
-Leila Ghaffari <Leila.Ghaffari@colorado.edu>            <49916147+LeilaGhaffari@users.noreply.github.com>
-Leila Ghaffari <Leila.Ghaffari@colorado.edu>            <leila@Leilas-MacBook-Pro.local>
+Kenneth E. Jansen <Kenneth.Jansen@colorado.edu>         <kenneth.jansen@colorado.edu>
+Layla Ghaffari <Layla.Ghaffari@colorado.edu>            <Leila.Ghaffari@colorado.edu>
+Layla Ghaffari <Layla.Ghaffari@colorado.edu>            <49916147+LeilaGhaffari@users.noreply.github.com>
+Layla Ghaffari <Layla.Ghaffari@colorado.edu>            <leila@Leilas-MacBook-Pro.local>
+Natalie Beams <nbeams@icl.utk.edu>
 Natalie Beams <nbeams@icl.utk.edu>                      <246972+nbeams@users.noreply.github.com>
 Rey Koki <rey.koki@colorado.edu>                        <36133157+reykoki@users.noreply.github.com>
 Rezgar Shakeri <Rezgar.Shakeri@colorado.edu>            <42816410+rezgarshakeri@users.noreply.github.com>
+Rezgar Shakeri <Rezgar.Shakeri@colorado.edu>            <rezgar.shakeri@colorado.edu>
+Riccardo Balin <riccardo.balin@gmail.com>               <balin@uan-0001.head.cm.americas.sgi.com>
+Riccardo Balin <riccardo.balin@gmail.com>               <balin@uan-0002.head.cm.americas.sgi.com>
+Thilina Ratnayaka <thilinarmtb@gmail.com>
 Thilina Ratnayaka <thilinarmtb@gmail.com>               <thilinarmtb@users.noreply.github.com>
 Tzanio Kolev <tzanio@llnl.gov>
+Umesh Unnikrishnan <unnikrishnan@anl.gov>               <umesh.aero@gatech.edu>
 Valeria Barra <valeriabarra21@gmail.com>
 Valeria Barra <valeriabarra21@gmail.com>                <39932030+valeriabarra@users.noreply.github.com>
 Valeria Barra <valeriabarra21@gmail.com>                <vaba3353@shas0136.rc.int.colorado.edu>
@@ -31,3 +43,6 @@ Valeria Barra <valeriabarra21@gmail.com>                <valeria.barra@colorado.
 Will Pazner <will.e.p@gmail.com>                        <11493037+pazner@users.noreply.github.com>
 Yohann Dudouit <dudouit1@llnl.gov>
 Yohann Dudouit <dudouit1@llnl.gov>                      <yohann.dudouit@gmail.com>
+Zach Atkins <Zach.Atkins@colorado.edu>                  <zach.atkins@colorado.edu>
+Zach Atkins <Zach.Atkins@colorado.edu>                  <zachary.r.atkins@pm.me>
+Zach Atkins <Zach.Atkins@colorado.edu>                  <zacharyjayhawk@gmail.com>
diff --git a/.readthedocs.yml b/.readthedocs.yml
index b530328523..2b173bda47 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -5,14 +5,15 @@
 version: 2
 
 build:
-  os: ubuntu-22.04
+  os: ubuntu-24.04
   tools:
-    python: "3.11"
-    nodejs: "19"
+    python: "3.13"
+    nodejs: "23"
   apt_packages:
     - librsvg2-bin
   jobs:
     post_create_environment:
+      - npx playwright install
       - npm install -g @mermaid-js/mermaid-cli
 
 # Build documentation in the docs/ directory with Sphinx
diff --git a/AUTHORS b/AUTHORS
index 8c6e400008..adc091e7a1 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,11 +1,12 @@
 Ahmad Abdelfattah
+Zachary R. Atkins
 Valeria Barra
 Natalie Beams
 Jed Brown
 Jean-Sylvain Camier
 Veselin Dobrev
 Yohann Dudouit
-Leila Ghaffari
+Layla Ghaffari
 Sebastian Grimberg
 Tzanio Kolev
 David Medina
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index 01e35d0de3..2f48726d3e 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -37,7 +37,7 @@ Examples of representing our community include using an official e-mail address,
 
 ## Enforcement
 
-Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at jed@jedbrown.org, valeria@caltech.edu, or tzanio@llnl.gov.
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at jed@jedbrown.org, vbarra@sdsu.edu, or tzanio@llnl.gov.
 All complaints will be reviewed and investigated promptly and fairly.
 
 All community leaders are obligated to respect the privacy and security of the reporter of any incident.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d3eeeb5c03..5f0cdbf4fd 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -44,6 +44,18 @@ By making a contribution to this project, I certify that:
 
 (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
 
+## LLM Generated Content
+
+libCEED is a research software project, and we require citation of the origin of ideas in the same way that citations are expected for research papers.
+See the [San Francisco Declaration on Research Assessment](https://sfdora.org/read) for discussion on treating other research outputs, such as datasets and software, as first class artifacts like research papers.
+
+LLM/GenAI generated code can contain novel algorithms developed by other researchers and replicated without attribution.
+As such, we cannot accept pull requests containing code predominantly generated by LLM/GenAI.
+
+LLMs may be used to aid the development of code for pull requests (PR); however, the individual submitting the PR must certify to its contents as described by Developer's Certificate of Origin.
+The human creating the PR is ultimately responsible for the content in the PR.
+PRs must disclose and describe all LLM usage.
+
 ## Authorship
 
 libCEED contains components authored by many individuals.
diff --git a/Cargo.toml b/Cargo.toml
index a987ca8a95..83aaac7b46 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,10 +1,13 @@
 [workspace]
 members = [
-        "rust/libceed",
-        "rust/libceed-sys",
-        "examples/rust/ex1-volume",
-        "examples/rust/ex2-surface",
-        "examples/rust/ex3-vector-volume",
-        "examples/rust/ex4-vector-surface",
-        "examples/rust/mesh",
+    "rust/libceed",
+    "rust/libceed-sys",
+    "examples/rust/ex1-volume",
+    "examples/rust/ex1-volume-vector",
+    "examples/rust/ex2-surface",
+    "examples/rust/ex2-surface-vector",
+    "examples/rust/ex3-volume",
+    "examples/rust/ex3-volume-vector",
+    "examples/rust/mesh",
 ]
+exclude = ["examples/rust-qfunctions/ex1-volume-rs"]
diff --git a/LICENSE b/LICENSE
index ec06a37c93..85888e282b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 BSD 2-Clause License
 
-Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/Makefile b/Makefile
index 69f05b0673..f824ba2cd8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,64 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
 #
 # This file is part of CEED:  http://github.com/ceed
 
+# ------------------------------------------------------------
+# Configuration
+# ------------------------------------------------------------
+
+# config.mk stores cached configuration variables
 CONFIG ?= config.mk
 -include $(CONFIG)
+
+# common.mk holds definitions used in various makefiles throughout the project
 COMMON ?= common.mk
 -include $(COMMON)
 
+# Quiet, color output
+quiet ?= $($(1))
+
+# Cancel built-in and old-fashioned implicit rules which we don't use
+.SUFFIXES:
+
+.SECONDEXPANSION: # to expand $$(@D)/.DIR
+
+%/.DIR :
+	@mkdir -p $(@D)
+	@touch $@
+
+.PRECIOUS: %/.DIR
+
+
+DARWIN := $(filter Darwin,$(shell uname -s))
+
+
+# ------------------------------------------------------------
+# Root directories for backend dependencies
+# ------------------------------------------------------------
+
+# XSMM_DIR env variable should point to XSMM main (github.com/hfp/libxsmm)
+XSMM_DIR ?= ../libxsmm
+
+# Often /opt/cuda or /usr/local/cuda, but sometimes present on machines that don't support CUDA
+CUDA_DIR  ?=
+CUDA_ARCH ?=
+
+# Often /opt/rocm, but sometimes present on machines that don't support HIP
+ROCM_DIR ?=
+HIP_ARCH ?=
+
+# env variable MAGMA_DIR can be used too
+MAGMA_DIR ?= ../magma
+
+
+# ------------------------------------------------------------
+# Compiler flags
+# ------------------------------------------------------------
+
+# Detect user compiler options and set defaults
 ifeq (,$(filter-out undefined default,$(origin CC)))
   CC = gcc
 endif
@@ -26,8 +75,11 @@ ifeq (,$(filter-out undefined default,$(origin AR)))
   AR = ar
 endif
 ifeq (,$(filter-out undefined default,$(origin ARFLAGS)))
-  ARFLAGS = crD
+  ARFLAGS = $(if $(DARWIN),cr,crD)
 endif
+# Often /opt/rocm, but sometimes present on machines that don't support HIP
+ROCM_DIR ?= ${HIP_DIR}
+HIP_ARCH ?=
 NVCC ?= $(CUDA_DIR)/bin/nvcc
 NVCC_CXX ?= $(CXX)
 HIPCC ?= $(ROCM_DIR)/bin/hipcc
@@ -39,6 +91,17 @@ ifneq ($(EMSCRIPTEN),)
   EM_LDFLAGS = -s TOTAL_MEMORY=256MB
 endif
 
+HIP_CONFIG_RES := $(shell $(ROCM_DIR)/bin/hipconfig)
+ifneq (,$(findstring __HIP_PLATFORM_SPIRV__,$(HIP_CONFIG_RES)))
+  HIP_LIB_NAME = CHIP
+else ifneq (,$(findstring __HIP_PLATFORM_HCC__,$(HIP_CONFIG_RES)))
+  HIP_LIB_NAME = amdhip64
+else ifneq (,$(findstring __HIP_PLATFORM_AMD__,$(HIP_CONFIG_RES)))
+  HIP_LIB_NAME = amdhip64
+else 
+  $(error "HIP platform not supported")
+endif
+
 # ASAN must be left empty if you don't want to use it
 ASAN ?=
 
@@ -47,69 +110,28 @@ ASAN ?=
 # if any. If the user sets CEED_LDFLAGS or CEED_LDLIBS, they are used *instead
 # of* what we populate here (thus that's advanced usage and not recommended).
 CEED_LDFLAGS ?=
-CEED_LDLIBS ?=
+CEED_LDLIBS  ?=
 
 UNDERSCORE ?= 1
 
 # Verbose mode, V or VERBOSE
 V ?= $(VERBOSE)
 
-# MFEM_DIR env variable should point to sibling directory
-ifneq ($(wildcard ../mfem/libmfem.*),)
-  MFEM_DIR ?= ../mfem
-endif
-
-# NEK5K_DIR env variable should point to sibling directory
-ifneq ($(wildcard ../Nek5000/*),)
-  NEK5K_DIR ?= $(abspath ../Nek5000)
-endif
-export NEK5K_DIR
-MPI ?= 1
-
-# DEAL_II_DIR env variable should point to sibling directory
-ifneq ($(wildcard ../dealii/install/lib/libdeal_II.*),)
-  DEAL_II_DIR ?= ../dealii/install
-endif
-export DEAL_II_DIR
-
-# CEED_DIR env for NEK5K testing
-export CEED_DIR = $(abspath .)
-
-# XSMM_DIR env variable should point to XSMM main (github.com/hfp/libxsmm)
-XSMM_DIR ?= ../libxsmm
-
-# OCCA_DIR env variable should point to OCCA main (github.com/libocca/occa)
-OCCA_DIR ?= ../occa/install
-
-# env variable MAGMA_DIR can be used too
-MAGMA_DIR ?= ../magma
-
-# Often /opt/cuda or /usr/local/cuda, but sometimes present on machines that don't support CUDA
-CUDA_DIR  ?=
-CUDA_ARCH ?=
-
-# Often /opt/rocm, but sometimes present on machines that don't support HIP
-ROCM_DIR ?=
-HIP_ARCH ?=
-
-# Check for PETSc in ../petsc
-ifneq ($(wildcard ../petsc/lib/libpetsc.*),)
-  PETSC_DIR ?= ../petsc
-endif
-
-# SmartSim testing
-SMARTREDIS_DIR ?=
-
-# Warning: SANTIZ options still don't run with /gpu/occa
+# SANTIZ options
 AFLAGS ?= -fsanitize=address #-fsanitize=undefined -fno-omit-frame-pointer
 
 # Note: Intel oneAPI C/C++ compiler is now icx/icpx
-CC_VENDOR := $(firstword $(filter gcc (GCC) clang icc icc_orig oneAPI XL emcc,$(subst -, ,$(shell $(CC) --version))))
+CC_VENDOR := $(firstword $(filter gcc (GCC) clang cc icc icc_orig oneAPI XL emcc,$(subst -, ,$(shell $(CC) --version))))
 CC_VENDOR := $(subst (GCC),gcc,$(subst icc_orig,icc,$(CC_VENDOR)))
+CC_VENDOR := $(if $(filter cc,$(CC_VENDOR)),gcc,$(CC_VENDOR))
 FC_VENDOR := $(if $(FC),$(firstword $(filter GNU ifort ifx XL,$(shell $(FC) --version 2>&1 || $(FC) -qversion))))
 
+# Host architecture for setting appropriate flags
+UNAME_M := $(shell uname -m)
+
 # Default extra flags by vendor
-MARCHFLAG.gcc           := -march=native
+# GCC: use -march=native only on x86 (where -mcpu doesn't exist); use -mcpu=native elsewhere
+MARCHFLAG.gcc           := $(if $(filter x86_64 i%86,$(UNAME_M)),-march=native,-mcpu=native)
 MARCHFLAG.clang         := $(MARCHFLAG.gcc)
 MARCHFLAG.icc           :=
 MARCHFLAG.oneAPI        := $(MARCHFLAG.clang)
@@ -130,7 +152,7 @@ OPT.clang               := $(OPT.gcc)
 OPT.icc                 := $(OPT.gcc)
 OPT.oneAPI              := $(OPT.clang)
 OPT.emcc                :=
-CFLAGS.gcc              := $(if $(STATIC),,-fPIC) -std=c99 -Wall -Wextra -Wno-unused-parameter -MMD -MP
+CFLAGS.gcc              := $(if $(STATIC),,-fPIC) -std=c11 -Wall -Wextra -Wno-unused-parameter -MMD -MP
 CFLAGS.clang            := $(CFLAGS.gcc)
 CFLAGS.icc              := $(CFLAGS.gcc)
 CFLAGS.oneAPI           := $(CFLAGS.clang)
@@ -162,18 +184,19 @@ OMP_SIMD_FLAG := $(if $(call cc_check_flag,$(OMP_SIMD_FLAG)),$(OMP_SIMD_FLAG))
 PEDANTIC      ?=
 PEDANTICFLAGS ?= -Werror -pedantic
 
+# Compiler flags
 OPT    ?= -O $(MARCHFLAG) $(OPT.$(CC_VENDOR)) $(OMP_SIMD_FLAG)
 CFLAGS ?= $(OPT) $(CFLAGS.$(CC_VENDOR)) $(if $(PEDANTIC),$(PEDANTICFLAGS))
 CXXFLAGS ?= $(OPT) $(CXXFLAGS.$(CC_VENDOR)) $(if $(PEDANTIC),$(PEDANTICFLAGS))
 FFLAGS ?= $(OPT) $(FFLAGS.$(FC_VENDOR))
 LIBCXX ?= -lstdc++
-NVCCFLAGS ?= -ccbin $(CXX) -Xcompiler "$(OPT)" -Xcompiler -fPIC
+NVCCFLAGS ?= -ccbin $(CXX) -Xcompiler '$(OPT)' -Xcompiler -fPIC
 ifneq ($(CUDA_ARCH),)
   NVCCFLAGS += -arch=$(CUDA_ARCH)
 endif
 HIPCCFLAGS ?= $(filter-out $(OMP_SIMD_FLAG),$(OPT)) -fPIC -munsafe-fp-atomics
 ifneq ($(HIP_ARCH),)
-  HIPCCFLAGS += --amdgpu-target=$(HIP_ARCH)
+  HIPCCFLAGS += --offload-arch=$(HIP_ARCH)
 endif
 SYCL_FLAG := $(SYCL_FLAG.$(CC_VENDOR))
 SYCLFLAGS ?= $(SYCL_FLAG) -fPIC -std=c++17 $(filter-out -std=c++11,$(CXXFLAGS)) $(filter-out $(OMP_SIMD_FLAG),$(OPT))
@@ -201,7 +224,6 @@ OBJDIR := build
 for_install := $(filter install,$(MAKECMDGOALS))
 LIBDIR := $(if $(for_install),$(OBJDIR),lib)
 
-
 # Installation variables
 prefix ?= /usr/local
 bindir = $(prefix)/bin
@@ -221,7 +243,6 @@ MFLAGS := -j $(NPROCS) --warn-undefined-variables \
 PYTHON ?= python3
 PROVE ?= prove
 PROVE_OPTS ?= -j $(NPROCS)
-DARWIN := $(filter Darwin,$(shell uname -s))
 SO_EXT := $(if $(DARWIN),dylib,so)
 
 ceed.pc := $(LIBDIR)/pkgconfig/ceed.pc
@@ -229,64 +250,80 @@ libceed.so := $(LIBDIR)/libceed.$(SO_EXT)
 libceed.a := $(LIBDIR)/libceed.a
 libceed := $(if $(STATIC),$(libceed.a),$(libceed.so))
 CEED_LIBS = -lceed
-libceed.c := $(filter-out interface/ceed-cuda.c interface/ceed-hip.c interface/ceed-jit-source-root-$(if $(for_install),default,install).c, $(wildcard interface/ceed*.c backends/*.c gallery/*.c))
-gallery.c := $(wildcard gallery/*/ceed*.c)
-libceed.c += $(gallery.c)
 libceeds = $(libceed)
 BACKENDS_BUILTIN := /cpu/self/ref/serial /cpu/self/ref/blocked /cpu/self/opt/serial /cpu/self/opt/blocked
 BACKENDS_MAKE := $(BACKENDS_BUILTIN)
-TEST_BACKENDS := /cpu/self/tmpl /cpu/self/tmpl/sub
 
-# Tests
-tests.c   := $(sort $(wildcard tests/t[0-9][0-9][0-9]-*.c))
-tests.f   := $(if $(FC),$(sort $(wildcard tests/t[0-9][0-9][0-9]-*.f90)))
-tests     := $(tests.c:tests/%.c=$(OBJDIR)/%$(EXE_SUFFIX))
-ctests    := $(tests)
-tests     += $(tests.f:tests/%.f90=$(OBJDIR)/%$(EXE_SUFFIX))
-# Examples
-examples.c := $(sort $(wildcard examples/ceed/*.c))
-examples.f := $(if $(FC),$(sort $(wildcard examples/ceed/*.f)))
-examples   := $(examples.c:examples/ceed/%.c=$(OBJDIR)/%$(EXE_SUFFIX))
-examples   += $(examples.f:examples/ceed/%.f=$(OBJDIR)/%$(EXE_SUFFIX))
-# MFEM Examples
-mfemexamples.cpp := $(sort $(wildcard examples/mfem/*.cpp))
-mfemexamples  := $(mfemexamples.cpp:examples/mfem/%.cpp=$(OBJDIR)/mfem-%)
-# Nek5K Examples
-nekexamples   := $(OBJDIR)/nek-bps
-# PETSc Examples
-petscexamples.c := $(wildcard examples/petsc/*.c)
-petscexamples   := $(petscexamples.c:examples/petsc/%.c=$(OBJDIR)/petsc-%)
-# deal.II Examples
-dealiiexamples  := $(OBJDIR)/dealii-bps
-# Fluid Dynamics Examples
-fluidsexamples.c  := $(sort $(wildcard examples/fluids/*.c))
-fluidsexamples.py := examples/fluids/smartsim_regression_framework.py
-fluidsexamples    := $(fluidsexamples.c:examples/fluids/%.c=$(OBJDIR)/fluids-%)
-fluidsexamples    += $(fluidsexamples.py:examples/fluids/%.py=$(OBJDIR)/fluids-py-%)
-# Solid Mechanics Examples
-solidsexamples.c  := $(sort $(wildcard examples/solids/*.c))
-solidsexamples    := $(solidsexamples.c:examples/solids/%.c=$(OBJDIR)/solids-%)
-
-# Backends/[ref, blocked, memcheck, opt, avx, occa, magma]
+
+# ------------------------------------------------------------
+# Root directories for examples using external libraries
+# ------------------------------------------------------------
+
+# DEAL_II_DIR env variable should point to sibling directory
+ifneq ($(wildcard ../dealii/install/lib/libdeal_II.*),)
+  DEAL_II_DIR ?= ../dealii/install
+endif
+# Export for deal.II testing
+export DEAL_II_DIR
+
+# MFEM_DIR env variable should point to sibling directory
+ifneq ($(wildcard ../mfem/libmfem.*),)
+  MFEM_DIR ?= ../mfem
+endif
+
+# NEK5K_DIR env variable should point to sibling directory
+ifneq ($(wildcard ../Nek5000/*),)
+  NEK5K_DIR ?= $(abspath ../Nek5000)
+endif
+# Exports for NEK5K testing
+export CEED_DIR = $(abspath .)
+export NEK5K_DIR
+MPI ?= 1
+
+# Check for PETSc in ../petsc
+ifneq ($(wildcard ../petsc/lib/libpetsc.*),)
+  PETSC_DIR ?= ../petsc
+endif
+
+# ------------------------------------------------------------
+# Build the library (default target)
+# ------------------------------------------------------------
+
+lib: $(libceed) $(ceed.pc)
+# run 'lib' target in parallel
+par:;@$(MAKE) $(MFLAGS) V=$(V) lib
+
+$(libceed.so) : CEED_LDFLAGS += $(if $(DARWIN), -install_name @rpath/$(notdir $(libceed.so)))
+
+# ------------------------------------------------------------
+# Source files
+# ------------------------------------------------------------
+
+# Interface and gallery
+libceed.c := $(filter-out interface/ceed-cuda.c interface/ceed-hip.c interface/ceed-jit-source-root-$(if $(for_install),default,install).c, $(wildcard interface/ceed*.c backends/weak/*.c gallery/*.c))
+gallery.c := $(wildcard gallery/*/ceed*.c)
+libceed.c += $(gallery.c)
+
+# Backends
+# - CPU
 ref.c          := $(sort $(wildcard backends/ref/*.c))
 blocked.c      := $(sort $(wildcard backends/blocked/*.c))
 ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c))
 opt.c          := $(sort $(wildcard backends/opt/*.c))
 avx.c          := $(sort $(wildcard backends/avx/*.c))
 xsmm.c         := $(sort $(wildcard backends/xsmm/*.c))
+# - GPU
 cuda.c         := $(sort $(wildcard backends/cuda/*.c))
 cuda.cpp       := $(sort $(wildcard backends/cuda/*.cpp))
 cuda-ref.c     := $(sort $(wildcard backends/cuda-ref/*.c))
 cuda-ref.cpp   := $(sort $(wildcard backends/cuda-ref/*.cpp))
 cuda-ref.cu    := $(sort $(wildcard backends/cuda-ref/kernels/*.cu))
 cuda-shared.c  := $(sort $(wildcard backends/cuda-shared/*.c))
-cuda-shared.cu := $(sort $(wildcard backends/cuda-shared/kernels/*.cu))
 cuda-gen.c     := $(sort $(wildcard backends/cuda-gen/*.c))
 cuda-gen.cpp   := $(sort $(wildcard backends/cuda-gen/*.cpp))
-cuda-gen.cu    := $(sort $(wildcard backends/cuda-gen/kernels/*.cu))
-occa.cpp       := $(sort $(shell find backends/occa -type f -name *.cpp))
-magma.c        := $(sort $(wildcard backends/magma/*.c))
-magma.cpp      := $(sort $(wildcard backends/magma/*.cpp))
+cuda-all.c     := interface/ceed-cuda.c $(cuda.c) $(cuda-ref.c) $(cuda-shared.c) $(cuda-gen.c)
+cuda-all.cpp   := $(cuda.cpp) $(cuda-ref.cpp) $(cuda-gen.cpp)
+cuda-all.cu    := $(cuda-ref.cu)
 hip.c          := $(sort $(wildcard backends/hip/*.c))
 hip.cpp        := $(sort $(wildcard backends/hip/*.cpp))
 hip-ref.c      := $(sort $(wildcard backends/hip-ref/*.c))
@@ -295,34 +332,104 @@ hip-ref.hip    := $(sort $(wildcard backends/hip-ref/kernels/*.hip.cpp))
 hip-shared.c   := $(sort $(wildcard backends/hip-shared/*.c))
 hip-gen.c      := $(sort $(wildcard backends/hip-gen/*.c))
 hip-gen.cpp    := $(sort $(wildcard backends/hip-gen/*.cpp))
+hip-all.c      := interface/ceed-hip.c $(hip.c) $(hip-ref.c) $(hip-shared.c) $(hip-gen.c)
+hip-all.cpp    := $(hip.cpp) $(hip-ref.cpp) $(hip-gen.cpp)
+hip-all.hip    := $(hip-ref.hip)
 sycl-core.cpp  := $(sort $(wildcard backends/sycl/*.sycl.cpp))
 sycl-ref.cpp   := $(sort $(wildcard backends/sycl-ref/*.sycl.cpp))
 sycl-shared.cpp:= $(sort $(wildcard backends/sycl-shared/*.sycl.cpp))
 sycl-gen.cpp   := $(sort $(wildcard backends/sycl-gen/*.sycl.cpp))
+magma.c        := $(sort $(wildcard backends/magma/*.c))
+magma.cpp      := $(sort $(wildcard backends/magma/*.cpp))
 
-hip-all.c := interface/ceed-hip.c $(hip.c) $(hip-ref.c) $(hip-shared.c) $(hip-gen.c)
-hip-all.cpp := $(hip.cpp) $(hip-ref.cpp) $(hip-gen.cpp)
+# Tests
+tests.c := $(sort $(wildcard tests/t[0-9][0-9][0-9]-*.c))
+tests.f := $(if $(FC),$(sort $(wildcard tests/t[0-9][0-9][0-9]-*.f90)))
+tests   := $(tests.c:tests/%.c=$(OBJDIR)/%$(EXE_SUFFIX))
+ctests  := $(tests)
+tests   += $(tests.f:tests/%.f90=$(OBJDIR)/%$(EXE_SUFFIX))
 
-# Quiet, color output
-quiet ?= $($(1))
+# Examples
+examples.c := $(sort $(wildcard examples/ceed/*.c))
+examples.f := $(if $(FC),$(sort $(wildcard examples/ceed/*.f90)))
+examples   := $(examples.c:examples/ceed/%.c=$(OBJDIR)/%$(EXE_SUFFIX))
+examples   += $(examples.f:examples/ceed/%.f90=$(OBJDIR)/%$(EXE_SUFFIX))
 
-# Cancel built-in and old-fashioned implicit rules which we don't use
-.SUFFIXES:
+# deal.II Examples
+dealiiexamples.cc := $(sort $(wildcard examples/deal.II/*.cc))
+dealiiexamples    := $(dealiiexamples.cc:examples/deal.II/%.cc=$(OBJDIR)/dealii-%)
 
-.SECONDEXPANSION: # to expand $$(@D)/.DIR
+# MFEM Examples
+mfemexamples.cpp := $(sort $(wildcard examples/mfem/*.cpp))
+mfemexamples     := $(mfemexamples.cpp:examples/mfem/%.cpp=$(OBJDIR)/mfem-%)
 
-%/.DIR :
-	@mkdir -p $(@D)
-	@touch $@
+# Nek5K Examples
+nekexamples := $(OBJDIR)/nek-bps
 
-.PRECIOUS: %/.DIR
+# Rust QFunction Examples
+rustqfunctions.c       := $(sort $(wildcard examples/rust-qfunctions/*.c))
+rustqfunctionsexamples := $(rustqfunctions.c:examples/rust-qfunctions/%.c=$(OBJDIR)/rustqfunctions-%)
+
+# PETSc Examples
+petscexamples.c := $(wildcard examples/petsc/*.c)
+petscexamples   := $(petscexamples.c:examples/petsc/%.c=$(OBJDIR)/petsc-%)
+
+# Fluid Dynamics Example
+fluidsexamples.c := $(sort $(wildcard examples/fluids/*.c))
+fluidsexamples   := $(fluidsexamples.c:examples/fluids/%.c=$(OBJDIR)/fluids-%)
+
+# Solid Mechanics Example
+solidsexamples.c := $(sort $(wildcard examples/solids/*.c))
+solidsexamples   := $(solidsexamples.c:examples/solids/%.c=$(OBJDIR)/solids-%)
+
+
+# ------------------------------------------------------------
+# View configuration options
+# ------------------------------------------------------------
 
-lib: $(libceed) $(ceed.pc)
-# run 'lib' target in parallel
-par:;@$(MAKE) $(MFLAGS) V=$(V) lib
 backend_status = $(if $(filter $1,$(BACKENDS_MAKE)), [backends: $1], [not found])
+
+info-basic:
+	$(info -----------------------------------------)
+	$(info |     ___ __    ______________________  |)
+	$(info |    / (_) /_  / ____/ ____/ ____/ __ \ |)
+	$(info |   / / / __ \/ /   / __/ / __/ / / / / |)
+	$(info |  / / / /_/ / /___/ /___/ /___/ /_/ /  |)
+	$(info | /_/_/_.___/\____/_____/_____/_____/   |)
+	$(info -----------------------------------------)
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Built-in Backends:)
+	$(info   $(BACKENDS_BUILTIN))
+	$(info )
+	$(info Additional Backends:)
+	$(info   $(filter-out $(BACKENDS_BUILTIN),$(BACKENDS)))
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	@true
+
 info:
-	$(info ------------------------------------)
+	$(info -----------------------------------------)
+	$(info |     ___ __    ______________________  |)
+	$(info |    / (_) /_  / ____/ ____/ ____/ __ \ |)
+	$(info |   / / / __ \/ /   / __/ / __/ / / / / |)
+	$(info |  / / / /_/ / /___/ /___/ /___/ /_/ /  |)
+	$(info | /_/_/_.___/\____/_____/_____/_____/   |)
+	$(info -----------------------------------------)
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Built-in Backends:)
+	$(info   $(BACKENDS_BUILTIN))
+	$(info )
+	$(info Additional Backends:)
+	$(info   $(filter-out $(BACKENDS_BUILTIN),$(BACKENDS)))
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Compiler Flags:)
 	$(info CC            = $(CC))
 	$(info CXX           = $(CXX))
 	$(info FC            = $(FC))
@@ -341,35 +448,54 @@ info:
 	$(info AFLAGS        = $(AFLAGS))
 	$(info ASAN          = $(or $(ASAN),(empty)))
 	$(info VERBOSE       = $(or $(V),(empty)) [verbose=$(if $(V),on,off)])
-	$(info ------------------------------------)
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Backend Dependencies:)
 	$(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS)))
 	$(info AVX_STATUS    = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS)))
 	$(info XSMM_DIR      = $(XSMM_DIR)$(call backend_status,$(XSMM_BACKENDS)))
-	$(info OCCA_DIR      = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
-	$(info MAGMA_DIR     = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
 	$(info CUDA_DIR      = $(CUDA_DIR)$(call backend_status,$(CUDA_BACKENDS)))
 	$(info ROCM_DIR      = $(ROCM_DIR)$(call backend_status,$(HIP_BACKENDS)))
 	$(info SYCL_DIR      = $(SYCL_DIR)$(call backend_status,$(SYCL_BACKENDS)))
-	$(info ------------------------------------)
+	$(info MAGMA_DIR     = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Example Dependencies:)
 	$(info MFEM_DIR      = $(MFEM_DIR))
 	$(info NEK5K_DIR     = $(NEK5K_DIR))
 	$(info PETSC_DIR     = $(PETSC_DIR))
 	$(info DEAL_II_DIR   = $(DEAL_II_DIR))
-	$(info ------------------------------------)
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Install Options:)
 	$(info prefix        = $(prefix))
 	$(info includedir    = $(value includedir))
 	$(info libdir        = $(value libdir))
 	$(info pkgconfigdir  = $(value pkgconfigdir))
-	$(info ------------------------------------)
+	$(info )
+	$(info -----------------------------------------)
+	$(info )
+	$(info Git:)
+	$(info describe      = $(GIT_DESCRIBE))
+	$(info )
+	$(info -----------------------------------------)
 	@true
+
 info-backends:
 	$(info make: 'lib' with optional backends: $(filter-out $(BACKENDS_BUILTIN),$(BACKENDS)))
 	@true
+
 info-backends-all:
-	$(info make: 'lib' with backends: $(filter-out $(TEST_BACKENDS),$(BACKENDS)))
+	$(info make: 'lib' with backends: $(BACKENDS))
 	@true
 
-$(libceed.so) : CEED_LDFLAGS += $(if $(DARWIN), -install_name @rpath/$(notdir $(libceed.so)))
+
+# ------------------------------------------------------------
+# Backends
+# ------------------------------------------------------------
 
 # Standard Backends
 libceed.c += $(ref.c)
@@ -423,22 +549,6 @@ ifneq ($(wildcard $(XSMM_DIR)/lib/libxsmm.*),)
   BACKENDS_MAKE += $(XSMM_BACKENDS)
 endif
 
-# OCCA Backends
-OCCA_BACKENDS = /cpu/self/occa
-ifneq ($(wildcard $(OCCA_DIR)/lib/libocca.*),)
-  OCCA_MODES := $(shell LD_LIBRARY_PATH=$(OCCA_DIR)/lib $(OCCA_DIR)/bin/occa modes)
-  OCCA_BACKENDS += $(if $(filter OpenMP,$(OCCA_MODES)),/cpu/openmp/occa)
-  OCCA_BACKENDS += $(if $(filter dpcpp,$(OCCA_MODES)),/gpu/dpcpp/occa)
-  OCCA_BACKENDS += $(if $(filter OpenCL,$(OCCA_MODES)),/gpu/opencl/occa)
-  OCCA_BACKENDS += $(if $(filter HIP,$(OCCA_MODES)),/gpu/hip/occa)
-  OCCA_BACKENDS += $(if $(filter CUDA,$(OCCA_MODES)),/gpu/cuda/occa)
-  $(libceeds) : CPPFLAGS += -I$(OCCA_DIR)/include
-  PKG_LIBS += -L$(abspath $(OCCA_DIR))/lib -locca
-  LIBCEED_CONTAINS_CXX = 1
-  libceed.cpp += $(occa.cpp)
-  BACKENDS_MAKE += $(OCCA_BACKENDS)
-endif
-
 # CUDA Backends
 ifneq ($(CUDA_DIR),)
   CUDA_LIB_DIR := $(wildcard $(foreach d,lib lib64 lib/x86_64-linux-gnu,$(CUDA_DIR)/$d/libcudart.${SO_EXT}))
@@ -452,27 +562,34 @@ ifneq ($(CUDA_LIB_DIR),)
   PKG_STUBS_LIBS += -L$(CUDA_LIB_DIR_STUBS)
   LIBCEED_CONTAINS_CXX = 1
   libceed.c     += interface/ceed-cuda.c
-  libceed.c     += $(cuda.c) $(cuda-ref.c) $(cuda-shared.c) $(cuda-gen.c)
-  libceed.cpp   += $(cuda.cpp) $(cuda-ref.cpp) $(cuda-gen.cpp)
-  libceed.cu    += $(cuda-ref.cu) $(cuda-shared.cu) $(cuda-gen.cu)
+  libceed.c     += $(cuda-all.c)
+  libceed.cpp   += $(cuda-all.cpp)
+  libceed.cu    += $(cuda-all.cu)
   BACKENDS_MAKE += $(CUDA_BACKENDS)
 endif
 
 # HIP Backends
-HIP_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(ROCM_DIR)/$d/libamdhip64.${SO_EXT}))
+HIP_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(ROCM_DIR)/$d/lib${HIP_LIB_NAME}.${SO_EXT}))
 HIP_LIB_DIR := $(patsubst %/,%,$(dir $(firstword $(HIP_LIB_DIR))))
 HIP_BACKENDS = /gpu/hip/ref /gpu/hip/shared /gpu/hip/gen
 ifneq ($(HIP_LIB_DIR),)
-  HIPCONFIG_CPPFLAGS := $(subst =,,$(shell $(ROCM_DIR)/bin/hipconfig -C))
-  $(hip-all.c:%.c=$(OBJDIR)/%.o) $(hip-all.c:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS)
+  HIPCONFIG_CPPFLAGS := $(shell $(ROCM_DIR)/bin/hipconfig -C)
+  # chipStar hipconfig -C includes clang-only flags (--target=, --offload=, -nohipwrapperinc, --hip-path=);
+  # strip those out for gcc-compiled C sources, keeping -D/-I/-include flags
+  ifeq ($(HIP_LIB_NAME),CHIP)
+    HIPCONFIG_CPPFLAGS_C := $(filter-out --offload% -nohipwrapperinc --hip-path% --target%,$(HIPCONFIG_CPPFLAGS)) -I$(ROCM_DIR)/include
+  else
+    HIPCONFIG_CPPFLAGS_C := $(HIPCONFIG_CPPFLAGS)
+  endif
+  $(hip-all.c:%.c=$(OBJDIR)/%.o) $(hip-all.c:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C)
   ifneq ($(CXX), $(HIPCC))
-    $(hip-all.cpp:%.cpp=$(OBJDIR)/%.o) $(hip-all.cpp:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS)
+    $(hip-all.cpp:%.cpp=$(OBJDIR)/%.o) $(hip-all.cpp:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C)
   endif
-  PKG_LIBS += -L$(abspath $(HIP_LIB_DIR)) -lamdhip64 -lhipblas
+  PKG_LIBS += -L$(abspath $(HIP_LIB_DIR)) -l${HIP_LIB_NAME} -lhipblas
   LIBCEED_CONTAINS_CXX = 1
   libceed.c     += $(hip-all.c)
   libceed.cpp   += $(hip-all.cpp)
-  libceed.hip   += $(hip-ref.hip)
+  libceed.hip   += $(hip-all.hip)
   BACKENDS_MAKE += $(HIP_BACKENDS)
 endif
 
@@ -485,7 +602,7 @@ endif
 ifneq ($(SYCL_LIB_DIR),)
   PKG_LIBS += $(SYCL_FLAG) -lze_loader
   LIBCEED_CONTAINS_CXX = 1
-  libceed.sycl += $(sycl-core.cpp) $(sycl-ref.cpp) $(sycl-shared.cpp) $(sycl-gen.cpp)
+  libceed.sycl  += $(sycl-core.cpp) $(sycl-ref.cpp) $(sycl-shared.cpp) $(sycl-gen.cpp)
   BACKENDS_MAKE += $(SYCL_BACKENDS)
 endif
 
@@ -528,6 +645,11 @@ endif
 BACKENDS ?= $(BACKENDS_MAKE)
 export BACKENDS
 
+
+# ------------------------------------------------------------
+# Linker Flags
+# ------------------------------------------------------------
+
 _pkg_ldflags = $(filter -L%,$(PKG_LIBS))
 _pkg_ldlibs = $(filter-out -L%,$(PKG_LIBS))
 $(libceeds) : CEED_LDFLAGS += $(_pkg_ldflags) $(if $(STATIC),,$(_pkg_ldflags:-L%=-Wl,-rpath,%)) $(PKG_STUBS_LIBS)
@@ -539,13 +661,23 @@ endif
 
 pkgconfig-libs-private = $(PKG_LIBS)
 ifeq ($(LIBCEED_CONTAINS_CXX),1)
-  $(libceeds) : LINK = $(CXX)
+  ifneq ($(SYCL_LIB_DIR),)
+    $(libceeds) : LINK = $(SYCLCXX)
+    $(libceeds) : CEED_LDFLAGS += $(SYCLFLAGS)
+  else
+    $(libceeds) : LINK = $(CXX)
+  endif
   ifeq ($(STATIC),1)
     $(examples) $(tests) : CEED_LDLIBS += $(LIBCXX)
     pkgconfig-libs-private += $(LIBCXX)
   endif
 endif
 
+
+# ------------------------------------------------------------
+# Building core library components
+# ------------------------------------------------------------
+
 # File names *-weak.c contain weak symbol definitions, which must be listed last
 # when creating shared or static libraries.
 weak_last = $(filter-out %-weak.o,$(1)) $(filter %-weak.o,$(1))
@@ -560,7 +692,7 @@ $(libceed.a) : $(call weak_last,$(libceed.o)) | $$(@D)/.DIR
 	$(call quiet,AR) $(ARFLAGS) $@ $^
 
 $(OBJDIR)/%.o : $(CURDIR)/%.c | $$(@D)/.DIR
-	$(call quiet,CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $(abspath $<)
+	$(call quiet,CC) $(CPPFLAGS) $(CFLAGS) $(CONFIGFLAGS) -c -o $@ $(abspath $<)
 
 $(OBJDIR)/%.o : $(CURDIR)/%.cpp | $$(@D)/.DIR
 	$(call quiet,CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $(abspath $<)
@@ -578,7 +710,7 @@ $(OBJDIR)/%.o : $(CURDIR)/%.sycl.cpp | $$(@D)/.DIR
 	$(call quiet,SYCLCXX) $(SYCLFLAGS) $(CPPFLAGS) -c -o $@ $(abspath $<)
 
 $(OBJDIR)/%$(EXE_SUFFIX) : tests/%.c | $$(@D)/.DIR
-	$(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
+	$(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS) -I./tests/test-include
 
 $(OBJDIR)/%$(EXE_SUFFIX) : tests/%.f90 | $$(@D)/.DIR
 	$(call quiet,LINK.F) -DSOURCE_DIR='"$(abspath $(<D))/"' $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
@@ -586,14 +718,32 @@ $(OBJDIR)/%$(EXE_SUFFIX) : tests/%.f90 | $$(@D)/.DIR
 $(OBJDIR)/%$(EXE_SUFFIX) : examples/ceed/%.c | $$(@D)/.DIR
 	$(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
 
-$(OBJDIR)/%$(EXE_SUFFIX) : examples/ceed/%.f | $$(@D)/.DIR
+$(OBJDIR)/%$(EXE_SUFFIX) : examples/ceed/%.f90 | $$(@D)/.DIR
 	$(call quiet,LINK.F) -DSOURCE_DIR='"$(abspath $(<D))/"' $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) $(LDLIBS)
 
+
+# ------------------------------------------------------------
+# Building examples
+# ------------------------------------------------------------
+
+# deal.II
+# Note: Invoking deal.II's CMAKE build system here
+.NOPARALLEL: dealii
+dealii :
+	mkdir -p examples/deal.II/build
+	cmake -B examples/deal.II/build -S examples/deal.II -DDEAL_II_DIR=$(DEAL_II_DIR) -DCEED_DIR=$(PWD)
+	+$(call quiet,MAKE) -C examples/deal.II/build
+
+$(OBJDIR)/dealii-% : examples/deal.II/*.cc examples/deal.II/*.h $(libceed) dealii | $$(@D)/.DIR
+	cp examples/deal.II/build/$* $@
+
+# MFEM
 $(OBJDIR)/mfem-% : examples/mfem/%.cpp $(libceed) | $$(@D)/.DIR
 	+$(MAKE) -C examples/mfem CEED_DIR=`pwd` \
 	  MFEM_DIR="$(abspath $(MFEM_DIR))" CXX=$(CXX) $*
 	cp examples/mfem/$* $@
 
+# Nek5000
 # Note: Multiple Nek files cannot be built in parallel. The '+' here enables
 #       this single Nek bps file to be built in parallel with other examples,
 #       such as when calling `make prove-all -j2`.
@@ -602,13 +752,12 @@ $(OBJDIR)/nek-bps : examples/nek/bps/bps.usr examples/nek/nek-examples.sh $(libc
 	mv examples/nek/build/bps $(OBJDIR)/bps
 	cp examples/nek/nek-examples.sh $(OBJDIR)/nek-bps
 
-# Note: Invoking deal.II's CMAKE build system here
-$(OBJDIR)/dealii-bps : examples/deal.II/*.cc examples/deal.II/*.h $(libceed) | $$(@D)/.DIR
-	mkdir -p examples/deal.II/build
-	cmake -B examples/deal.II/build -S examples/deal.II -DDEAL_II_DIR=$(DEAL_II_DIR) -DCEED_DIR=$(PWD)
-	+$(call quiet,MAKE) -C examples/deal.II/build
-	cp examples/deal.II/build/bps $(OBJDIR)/dealii-bps
+# Rust QFunctions
+$(OBJDIR)/rustqfunctions-% : examples/rust-qfunctions/%.c $(libceed) | $$(@D)/.DIR
+	+$(MAKE) -C examples/rust-qfunctions CEED_DIR=`pwd`
+	cp examples/rust-qfunctions/$* $@
 
+# PETSc
 # Several executables have common utilities, but we can't build the utilities
 # from separate submake invocations because they'll compete with each
 # other/corrupt output. So we put it in this utility library, but we don't want
@@ -623,14 +772,13 @@ $(OBJDIR)/petsc-% : examples/petsc/%.c examples/petsc/libutils.a.PHONY $(libceed
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/petsc/$* $@
 
-$(OBJDIR)/fluids-% : examples/fluids/%.c examples/fluids/src/*.c examples/fluids/*.h examples/fluids/problems/*.c examples/fluids/qfunctions/*.h examples/fluids/src/smartsim/*.c $(libceed) $(ceed.pc) | $$(@D)/.DIR
+# Fluid dynamics proxy application
+$(OBJDIR)/fluids-% : examples/fluids/%.c examples/fluids/src/*.c examples/fluids/*.h examples/fluids/include/*.h examples/fluids/problems/*.c examples/fluids/qfunctions/*.h $(libceed) $(ceed.pc) examples/fluids/Makefile | $$(@D)/.DIR
 	+$(call quiet,MAKE) -C examples/fluids CEED_DIR=`pwd` \
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/fluids/$* $@
 
-$(OBJDIR)/fluids-py-% : examples/fluids/%.py $(OBJDIR)/fluids-navierstokes
-	cp $< $@
-
+# Solid mechanics proxy application
 $(OBJDIR)/solids-% : examples/solids/%.c examples/solids/%.h \
     examples/solids/problems/*.c examples/solids/src/*.c \
     examples/solids/include/*.h examples/solids/problems/*.h examples/solids/qfunctions/*.h \
@@ -639,10 +787,34 @@ $(OBJDIR)/solids-% : examples/solids/%.c examples/solids/%.h \
 	  PETSC_DIR="$(abspath $(PETSC_DIR))" OPT="$(OPT)" $*
 	cp examples/solids/$* $@
 
+examples      : $(allexamples)
+ceedexamples  : $(examples)
+nekexamples   : $(nekexamples)
+mfemexamples  : $(mfemexamples)
+petscexamples : $(petscexamples)
+
+rustqfunctionsexamples : $(rustqfunctionsexamples)
+
+external_examples := \
+	$(if $(MFEM_DIR),$(mfemexamples)) \
+	$(if $(PETSC_DIR),$(petscexamples)) \
+	$(if $(NEK5K_DIR),$(nekexamples)) \
+	$(if $(DEAL_II_DIR),$(dealiiexamples)) \
+	$(if $(PETSC_DIR),$(fluidsexamples)) \
+	$(if $(PETSC_DIR),$(solidsexamples)) \
+	$(if $(or $(RUST_QF),$(GPU_CLANG)),$(rustqfunctionsexamples))
+
+allexamples = $(examples) $(external_examples)
+
 $(examples) : $(libceed)
 $(tests) : $(libceed)
 $(tests) $(examples) : override LDFLAGS += $(if $(STATIC),,-Wl,-rpath,$(abspath $(LIBDIR))) -L$(LIBDIR)
 
+
+# ------------------------------------------------------------
+# Testing
+# ------------------------------------------------------------
+
 # Set number processes for testing
 NPROC_TEST ?= 1
 export NPROC_TEST
@@ -652,17 +824,7 @@ NPROC_POOL ?= 1
 export NPROC_POOL
 
 run-% : $(OBJDIR)/%
-	@$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) $(<:$(OBJDIR)/%=%)
-
-external_examples := \
-	$(if $(MFEM_DIR),$(mfemexamples)) \
-	$(if $(PETSC_DIR),$(petscexamples)) \
-	$(if $(NEK5K_DIR),$(nekexamples)) \
-	$(if $(DEAL_II_DIR),$(dealiiexamples)) \
-	$(if $(PETSC_DIR),$(fluidsexamples)) \
-	$(if $(PETSC_DIR),$(solidsexamples))
-
-allexamples = $(examples) $(external_examples)
+	@$(PYTHON) tests/junit.py --mode tap --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --search '$(subsearch)' $(<:$(OBJDIR)/%=%)
 
 # The test and prove targets can be controlled via pattern searches.  The
 # default is to run tests and those examples that have no external dependencies.
@@ -674,6 +836,7 @@ allexamples = $(examples) $(external_examples)
 search ?= t ex
 realsearch = $(search:%=%%)
 matched = $(foreach pattern,$(realsearch),$(filter $(OBJDIR)/$(pattern),$(tests) $(allexamples)))
+subsearch ?= .*
 JUNIT_BATCH ?= ''
 
 # Test core libCEED
@@ -684,9 +847,11 @@ tst : ;@$(MAKE) $(MFLAGS) V=$(V) test
 # CPU C tests only for backend %
 ctc-% : $(ctests);@$(foreach tst,$(ctests),$(tst) /cpu/$*;)
 
+# Testing with TAP format
+# https://testanything.org/tap-specification.html
 prove : $(matched)
 	$(info Testing backends: $(BACKENDS))
-	$(PROVE) $(PROVE_OPTS) --exec 'tests/junit.py --mode tap --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL)' $(matched:$(OBJDIR)/%=%)
+	$(PROVE) $(PROVE_OPTS) --exec '$(PYTHON) tests/junit.py' $(matched:$(OBJDIR)/%=%) :: --mode tap --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --search '$(subsearch)'
 # Run prove target in parallel
 prv : ;@$(MAKE) $(MFLAGS) V=$(V) prove
 
@@ -694,24 +859,18 @@ prove-all :
 	+$(MAKE) prove realsearch=%
 
 junit-% : $(OBJDIR)/%
-	@printf "  %10s %s\n" TEST $(<:$(OBJDIR)/%=%); $(PYTHON) tests/junit.py --ceed-backends $(BACKENDS) $(if $(SMARTREDIS_DIR),--smartredis_dir $(SMARTREDIS_DIR) )--nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --junit-batch $(JUNIT_BATCH) $(<:$(OBJDIR)/%=%)
+	@printf "  %10s %s\n" TEST $(<:$(OBJDIR)/%=%); $(PYTHON) tests/junit.py --ceed-backends $(BACKENDS) --nproc $(NPROC_TEST) --pool-size $(NPROC_POOL) --search '$(subsearch)' --junit-batch $(JUNIT_BATCH) $(<:$(OBJDIR)/%=%)
 
 junit : $(matched:$(OBJDIR)/%=junit-%)
 
 all: $(alltests)
 
-examples : $(allexamples)
-ceedexamples : $(examples)
-nekexamples : $(nekexamples)
-mfemexamples : $(mfemexamples)
-petscexamples : $(petscexamples)
-
 # Benchmarks
 allbenchmarks = petsc-bps
 bench_targets = $(addprefix bench-,$(allbenchmarks))
 .PHONY: $(bench_targets) benchmarks
 $(bench_targets): bench-%: $(OBJDIR)/%
-	cd benchmarks && ./benchmark.sh --ceed "$(BACKENDS_MAKE)" -r $(*).sh
+	cd benchmarks && ./benchmark.sh --ceed "$(BACKENDS)" -r $(*).sh
 benchmarks: $(bench_targets)
 
 $(ceed.pc) : pkgconfig-prefix = $(abspath .)
@@ -720,11 +879,23 @@ $(OBJDIR)/ceed.pc : pkgconfig-prefix = $(prefix)
 %/ceed.pc : ceed.pc.template | $$(@D)/.DIR
 	@$(SED) \
 	    -e "s:%prefix%:$(pkgconfig-prefix):" \
+	    -e "s:%opt%:$(OPT):" \
 	    -e "s:%libs_private%:$(pkgconfig-libs-private):" $< > $@
 
+GIT_DESCRIBE = $(shell git -c safe.directory=$PWD describe --always --dirty 2>/dev/null || printf "unknown\n")
+
+$(OBJDIR)/interface/ceed-config.o: Makefile
+$(OBJDIR)/interface/ceed-config.o: CONFIGFLAGS += -DCEED_GIT_VERSION="\"$(GIT_DESCRIBE)\""
+$(OBJDIR)/interface/ceed-config.o: CONFIGFLAGS += -DCEED_BUILD_CONFIGURATION="\"// Build Configuration:$(foreach v,$(CONFIG_VARS),\n$(v) = $($(v)))\""
+
 $(OBJDIR)/interface/ceed-jit-source-root-default.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\""
 $(OBJDIR)/interface/ceed-jit-source-root-install.o : CPPFLAGS += -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath $(includedir))/\""
 
+
+# ------------------------------------------------------------
+# Installation
+# ------------------------------------------------------------
+
 install : $(libceed) $(OBJDIR)/ceed.pc
 	$(INSTALL) -d $(addprefix $(if $(DESTDIR),"$(DESTDIR)"),"$(includedir)"\
 	  "$(includedir)/ceed/" "$(includedir)/ceed/jit-source/"\
@@ -732,6 +903,7 @@ install : $(libceed) $(OBJDIR)/ceed.pc
 	  "$(includedir)/ceed/jit-source/gallery/" "$(includedir)/ceed/jit-source/magma/"\
 	  "$(includedir)/ceed/jit-source/sycl/" "$(libdir)" "$(pkgconfigdir)")
 	$(INSTALL_DATA) include/ceed/ceed.h "$(DESTDIR)$(includedir)/ceed/"
+	$(INSTALL_DATA) include/ceed/deprecated.h "$(DESTDIR)$(includedir)/ceed/"
 	$(INSTALL_DATA) include/ceed/types.h "$(DESTDIR)$(includedir)/ceed/"
 	$(INSTALL_DATA) include/ceed/ceed-f32.h "$(DESTDIR)$(includedir)/ceed/"
 	$(INSTALL_DATA) include/ceed/ceed-f64.h "$(DESTDIR)$(includedir)/ceed/"
@@ -749,18 +921,26 @@ install : $(libceed) $(OBJDIR)/ceed.pc
 	$(INSTALL_DATA) $(wildcard include/ceed/jit-source/magma/*.h) "$(DESTDIR)$(includedir)/ceed/jit-source/magma/"
 	$(INSTALL_DATA) $(wildcard include/ceed/jit-source/sycl/*.h) "$(DESTDIR)$(includedir)/ceed/jit-source/sycl/"
 
-.PHONY : all cln clean doxygen doc format lib install par print test tst prove prv prove-all junit examples tidy iwyu info info-backends info-backends-all
+
+# ------------------------------------------------------------
+# Cleaning
+# ------------------------------------------------------------
 
 cln clean :
 	$(RM) -r $(OBJDIR) $(LIBDIR) dist *egg* .pytest_cache *cffi*
 	$(call quiet,MAKE) -C examples clean NEK5K_DIR="$(abspath $(NEK5K_DIR))"
 	$(call quiet,MAKE) -C python/tests clean
 	$(RM) benchmarks/*output.txt
+	$(RM) -rf temp
 
 distclean : clean
 	$(RM) -r doc/html doc/sphinx/build $(CONFIG)
 
+
+# ------------------------------------------------------------
 # Documentation
+# ------------------------------------------------------------
+
 DOXYGEN ?= doxygen
 
 doxygen :
@@ -771,13 +951,18 @@ doc-html doc-latexpdf doc-epub doc-livehtml : doc-% : doxygen
 
 doc : doc-html
 
+
+# ------------------------------------------------------------
+# Linting utilities
+# ------------------------------------------------------------
+
 # Style/Format
 CLANG_FORMAT      ?= clang-format
 CLANG_FORMAT_OPTS += -style=file -i
 AUTOPEP8          ?= autopep8
 AUTOPEP8_OPTS     += --in-place --aggressive --max-line-length 120
 
-format.ch := $(filter-out include/ceedf.h $(wildcard tests/t*-f.h), $(shell git ls-files '*.[ch]pp' '*.[ch]'))
+format.ch := $(filter-out include/ceedf.h $(wildcard tests/t*-f.h) $(wildcard examples/ceed/ex*-f.h), $(shell git ls-files '*.[ch]pp' '*.[ch]' '*.cu'))
 format.py := $(filter-out tests/junit-xml/junit_xml/__init__.py, $(shell git ls-files '*.py'))
 format.ot := $(filter-out doc/sphinx/source/CODE_OF_CONDUCT.md doc/sphinx/source/CONTRIBUTING.md, $(shell git ls-files '*.md' '*.f90'))
 
@@ -794,7 +979,7 @@ format    : format-c format-py format-ot
 
 # Vermin - python version requirements
 VERMIN            ?= vermin
-VERMIN_OPTS       += -t=3.7- --violations
+VERMIN_OPTS       += -t=3.8- --violations
 
 vermin    :
 	$(VERMIN) $(VERMIN_OPTS) $(format.py)
@@ -803,10 +988,10 @@ vermin    :
 CLANG_TIDY ?= clang-tidy
 
 %.c.tidy : %.c
-	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c99 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\""
+	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c11 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include -DCEED_JIT_SOURCE_ROOT_DEFAULT="\"$(abspath ./include)/\"" -DCEED_GIT_VERSION="\"$(GIT_DESCRIBE)\"" -DCEED_BUILD_CONFIGURATION="\"// Build Configuration:$(foreach v,$(CONFIG_VARS),\n$(v) = $($(v)))\""
 
 %.cpp.tidy : %.cpp
-	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c++11 -I$(CUDA_DIR)/include -I$(OCCA_DIR)/include -I$(ROCM_DIR)/include
+	$(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c++11 -I$(CUDA_DIR)/include -I$(ROCM_DIR)/include
 
 tidy-c   : $(libceed.c:%=%.tidy)
 tidy-cpp : $(libceed.cpp:%=%.tidy)
@@ -821,6 +1006,11 @@ endif
 iwyu :
 	$(MAKE) -B CC=$(IWYU_CC)
 
+
+# ------------------------------------------------------------
+# Variable printing for debugging
+# ------------------------------------------------------------
+
 print :
 	@echo $(VAR)=$($(VAR))
 
@@ -833,6 +1023,11 @@ print-% :
 	$(info )
 	@true
 
+
+# ------------------------------------------------------------
+# Configuration caching
+# ------------------------------------------------------------
+
 # "make configure" detects any variables passed on the command line or
 # previously set in config.mk, caching them in config.mk as simple
 # (:=) variables.  Variables set in config.mk or on the command line
@@ -852,7 +1047,7 @@ print-% :
 CONFIG_VARS = CC CXX FC NVCC NVCC_CXX HIPCC \
   OPT CFLAGS CPPFLAGS CXXFLAGS FFLAGS NVCCFLAGS HIPCCFLAGS SYCLFLAGS \
   AR ARFLAGS LDFLAGS LDLIBS LIBCXX SED \
-  MAGMA_DIR OCCA_DIR XSMM_DIR CUDA_DIR CUDA_ARCH MFEM_DIR PETSC_DIR NEK5K_DIR ROCM_DIR HIP_ARCH SYCL_DIR SMARTREDIS_DIR
+  MAGMA_DIR XSMM_DIR CUDA_DIR CUDA_ARCH MFEM_DIR PETSC_DIR NEK5K_DIR ROCM_DIR HIP_ARCH SYCL_DIR
 
 # $(call needs_save,CFLAGS) returns true (a nonempty string) if CFLAGS
 # was set on the command line or in config.mk (where it will appear as
@@ -865,6 +1060,11 @@ configure :
 	@echo "Configuration cached in $(CONFIG):"
 	@cat $(CONFIG)
 
+
+# ------------------------------------------------------------
+# Building Python wheels for deployment
+# ------------------------------------------------------------
+
 wheel : export MARCHFLAG = -march=generic
 wheel : export WHEEL_PLAT = manylinux2010_x86_64
 wheel :
@@ -872,7 +1072,13 @@ wheel :
 	  -e MARCHFLAG -e WHEEL_PLAT \
 	  quay.io/pypa/$(WHEEL_PLAT) python/make-wheels.sh
 
-.PHONY : configure wheel
+# ------------------------------------------------------------
+# Phony targets
+# ------------------------------------------------------------
+
+# These targets are not files but rather commands to run
+.PHONY : all cln clean doxygen doc format lib install par print test tst prove prv prove-all junit examples tidy iwyu info info-backends info-backends-all configure wheel
+
 
 # Include *.d deps when not -B = --always-make: useful if the paths are wonky in a container
 -include $(if $(filter B,$(MAKEFLAGS)),,$(libceed.c:%.c=$(OBJDIR)/%.d) $(tests.c:tests/%.c=$(OBJDIR)/%.d))
diff --git a/README.md b/README.md
index 7c2d32c0ea..9725a1422c 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ The CEED research is supported by the [Exascale Computing Project](https://exasc
 
 For more details on the CEED API see the [user manual](https://libceed.org/en/latest/).
 
-% gettingstarted-inclusion-marker
+<!-- getting-started-inclusion -->
 
 ## Building
 
@@ -183,13 +183,6 @@ There are multiple supported backends, which can be selected at runtime in the e
 | `/gpu/hip/magma`           | HIP MAGMA kernels                                 | No                    |
 | `/gpu/hip/magma/det`       | HIP MAGMA kernels                                 | Yes                   |
 ||
-| **OCCA**                   |
-| `/*/occa`                  | Selects backend based on available OCCA modes     | Yes                   |
-| `/cpu/self/occa`           | OCCA backend with serial CPU kernels              | Yes                   |
-| `/cpu/openmp/occa`         | OCCA backend with OpenMP kernels                  | Yes                   |
-| `/cpu/dpcpp/occa`          | OCCA backend with DPC++ kernels                   | Yes                   |
-| `/gpu/cuda/occa`           | OCCA backend with CUDA kernels                    | Yes                   |
-| `/gpu/hip/occa`            | OCCA backend with HIP kernels                     | Yes                   |
 
 The `/cpu/self/*/serial` backends process one element at a time and are intended for meshes with a smaller number of high order elements.
 The `/cpu/self/*/blocked` backends process blocked batches of eight interlaced elements and are intended for meshes with higher numbers of elements.
@@ -207,6 +200,7 @@ This backend can be run in serial or blocked mode and defaults to running in the
 
 The `/cpu/self/xsmm/*` backends rely upon the [LIBXSMM](https://github.com/libxsmm/libxsmm) package to provide vectorized CPU performance.
 If linking MKL and LIBXSMM is desired but the Makefile is not detecting `MKLROOT`, linking libCEED against MKL can be forced by setting the environment variable `MKL=1`.
+The LIBXSMM `main` development branch from 7 April 2024 or newer is required.
 
 The `/gpu/cuda/*` backends provide GPU performance strictly using CUDA.
 
@@ -229,31 +223,24 @@ For example:
 
 > - `/gpu/cuda/gen:device_id=1`
 
-The `/*/occa` backends rely upon the [OCCA](http://github.com/libocca/occa) package to provide cross platform performance.
-To enable the OCCA backend, the environment variable `OCCA_DIR` must point to the top-level OCCA directory, with the OCCA library located in the `${OCCA_DIR}/lib` (By default, `OCCA_DIR` is set to `../occa`).
-OCCA version 1.4.0 or newer is required.
-
-Users can pass specific OCCA device properties after setting the CEED resource.
-For example:
-
-> - `"/*/occa:mode='CUDA',device_id=0"`
-
 Bit-for-bit reproducibility is important in some applications.
 However, some libCEED backends use non-deterministic operations, such as `atomicAdd` for increased performance.
 The backends which are capable of generating reproducible results, with the proper compilation options, are highlighted in the list above.
 
+<!-- getting-started-exclusion -->
+
 ## Examples
 
 libCEED comes with several examples of its usage, ranging from standalone C codes in the `/examples/ceed` directory to examples based on external packages, such as MFEM, PETSc, and Nek5000.
 Nek5000 v18.0 or greater is required.
 
-To build the examples, set the `MFEM_DIR`, `PETSC_DIR`, and `NEK5K_DIR` variables and run:
+To build the examples, set the `MFEM_DIR`, `PETSC_DIR` (and optionally `PETSC_ARCH`), and `NEK5K_DIR` variables and run:
 
 ```console
 $ cd examples/
 ```
 
-% running-examples-inclusion-marker
+<!-- running-examples-inclusion -->
 
 ```console
 # libCEED examples on CPU and GPU
@@ -336,7 +323,7 @@ The above code assumes a GPU-capable machine with the CUDA backends enabled.
 Depending on the available backends, other CEED resource specifiers can be provided with the `-ceed` option.
 Other command line arguments can be found in [examples/petsc](https://github.com/CEED/libCEED/blob/main/examples/petsc/README.md).
 
-% benchmarks-marker
+<!-- running-examples-exclusion -->
 
 ## Benchmarks
 
@@ -414,7 +401,22 @@ If you utilize libCEED please cite:
 
 ```bibtex
 @article{libceed-joss-paper,
-  author       = {Jed Brown and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jean Sylvain Camier and Veselin Dobrev and Yohann Dudouit and Leila Ghaffari and Tzanio Kolev and David Medina and Will Pazner and Thilina Ratnayaka and Jeremy Thompson and Stan Tomov},
+  author       = {
+    Brown, Jed and
+    Abdelfattah, Ahmad and
+    Barra, Valeria and
+    Beams, Natalie and
+    Camier, Jean-Sylvain and
+    Dobrev, Veselin and
+    Dudouit, Yohann and
+    Ghaffari, Leila and
+    Kolev, Tzanio and
+    Medina, David and
+    Pazner, Will and
+    Ratnayaka, Thilina and
+    Thompson, Jeremy L. and
+    Tomov, Stan
+  },
   title        = {{libCEED}: Fast algebra for high-order element-based discretizations},
   journal      = {Journal of Open Source Software},
   year         = {2021},
@@ -431,23 +433,25 @@ To cite the user manual:
 
 ```bibtex
 @misc{libceed-user-manual,
-  author       = {Abdelfattah, Ahmad and
-                  Barra, Valeria and
-                  Beams, Natalie and
-                  Brown, Jed and
-                  Camier, Jean-Sylvain and
-                  Dobrev, Veselin and
-                  Dudouit, Yohann and
-                  Ghaffari, Leila and
-                  Grimberg, Sebastian and
-                  Kolev, Tzanio and
-                  Medina, David and
-                  Pazner, Will and
-                  Ratnayaka, Thilina and
-                  Shakeri, Rezgar and
-                  Thompson, Jeremy L and
-                  Tomov, Stanimire and
-                  Wright III, James},
+  author       = {
+    Abdelfattah, Ahmad and
+    Barra, Valeria and
+    Beams, Natalie and
+    Brown, Jed and
+    Camier, Jean-Sylvain and
+    Dobrev, Veselin and
+    Dudouit, Yohann and
+    Ghaffari, Leila and
+    Grimberg, Sebastian and
+    Kolev, Tzanio and
+    Medina, David and
+    Pazner, Will and
+    Ratnayaka, Thilina and
+    Shakeri, Rezgar and
+    Thompson, Jeremy L. and
+    Tomov, Stanimire and
+    Wright III, James
+  },
   title        = {{libCEED} User Manual},
   month        = nov,
   year         = 2023,
@@ -460,9 +464,14 @@ To cite the user manual:
 For libCEED's Python interface please cite:
 
 ```bibtex
-@InProceedings{libceed-paper-proc-scipy-2020,
-  author    = {{V}aleria {B}arra and {J}ed {B}rown and {J}eremy {T}hompson and {Y}ohann {D}udouit},
-  title     = {{H}igh-performance operator evaluations with ease of use: lib{C}{E}{E}{D}'s {P}ython interface},
+@InProceedings{libceed-scipy,
+  author    = {
+    Barra, Valeria and
+    Brown, Jed and
+    Thompson, Jeremy L. and
+    Dudouit, Yohann
+  },
+  title     = {{H}igh-performance operator evaluations with ease of use: {libCEED}'s {P}ython interface},
   booktitle = {{P}roceedings of the 19th {P}ython in {S}cience {C}onference},
   pages     = {85 - 90},
   year      = {2020},
@@ -477,7 +486,7 @@ The BibTeX entries for these references can be found in the `doc/bib/references.
 
 The following copyright applies to each file in the CEED software suite, unless otherwise stated in the file:
 
-> Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+> Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 > All rights reserved.
 
 See files LICENSE and NOTICE for details.
diff --git a/backends/avx/ceed-avx-blocked.c b/backends/avx/ceed-avx-blocked.c
index c565faa653..8452fd8591 100644
--- a/backends/avx/ceed-avx-blocked.c
+++ b/backends/avx/ceed-avx-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -25,6 +25,7 @@ static int CeedInit_Avx(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Avx));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/avx/ceed-avx-serial.c b/backends/avx/ceed-avx-serial.c
index 5ebe28e19a..06ed5f9fdb 100644
--- a/backends/avx/ceed-avx-serial.c
+++ b/backends/avx/ceed-avx-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -25,6 +25,7 @@ static int CeedInit_Avx(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Avx));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/avx/ceed-avx-tensor.c b/backends/avx/ceed-avx-tensor.c
index cd22249e83..40d5df0646 100644
--- a/backends/avx/ceed-avx-tensor.c
+++ b/backends/avx/ceed-avx-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -10,7 +10,7 @@
 #include <immintrin.h>
 #include <stdbool.h>
 
-#ifdef CEED_F64_H
+#ifdef CEED_SCALAR_IS_FP64
 #define rtype __m256d
 #define loadu _mm256_loadu_pd
 #define storeu _mm256_storeu_pd
diff --git a/backends/avx/ceed-avx.h b/backends/avx/ceed-avx.h
index 786be45a0d..cb151baa85 100644
--- a/backends/avx/ceed-avx.h
+++ b/backends/avx/ceed-avx.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c
index d1f2678e1d..0161c2819f 100644
--- a/backends/blocked/ceed-blocked-operator.c
+++ b/backends/blocked/ceed-blocked-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,9 +16,9 @@
 //------------------------------------------------------------------------------
 // Setup Input/Output Fields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, const CeedInt block_size,
-                                           CeedElemRestriction *block_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs,
-                                           CeedInt start_e, CeedInt num_fields, CeedInt Q) {
+static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, CeedInt *e_data_out_indices,
+                                           bool *apply_add_basis, const CeedInt block_size, CeedElemRestriction *block_rstr, CeedVector *e_vecs_full,
+                                           CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
   Ceed                ceed;
   CeedSize            e_size, q_size;
   CeedInt             num_comp, size, P;
@@ -30,7 +30,8 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
 
     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
     CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
-    if (ceed_parent) ceed = ceed_parent;
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
   }
   if (is_input) {
     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
@@ -101,10 +102,14 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
           CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed_rstr, num_elem, elem_size, block_size, num_comp, l_size, strides,
                                                                   &block_rstr[i + start_e]));
         } break;
+        // LCOV_EXCL_START
         case CEED_RESTRICTION_POINTS:
           // Empty case - won't occur
           break;
+          // LCOV_EXCL_STOP
       }
+      CeedCallBackend(CeedDestroy(&ceed_rstr));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
       CeedCallBackend(CeedElemRestrictionCreateVector(block_rstr[i + start_e], NULL, &e_vecs_full[i + start_e]));
     }
 
@@ -122,6 +127,7 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
         CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         e_size = (CeedSize)P * num_comp * block_size;
         CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i]));
         q_size = (CeedSize)Q * size * block_size;
@@ -132,9 +138,63 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
         q_size = (CeedSize)Q * block_size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
     }
   }
+  // Drop duplicate restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j] = true;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j]          = true;
+          apply_add_basis[i]    = true;
+          e_data_out_indices[j] = i;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -143,7 +203,6 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
 //------------------------------------------------------------------------------
 static int CeedOperatorSetup_Blocked(CeedOperator op) {
   bool                  is_setup_done;
-  Ceed                  ceed;
   CeedInt               Q, num_input_fields, num_output_fields;
   const CeedInt         block_size = 8;
   CeedQFunctionField   *qf_input_fields, *qf_output_fields;
@@ -154,7 +213,6 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {
   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
   if (is_setup_done) return CEED_ERROR_SUCCESS;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -166,6 +224,10 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->block_rstr));
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
 
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_data_out_indices));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
@@ -177,11 +239,12 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {
 
   // Set up infield and outfield pointer arrays
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, true, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0,
-                                                  num_input_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, true, impl->skip_rstr_in, NULL, NULL, block_size, impl->block_rstr, impl->e_vecs_full,
+                                                  impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out,
-                                                  num_input_fields, num_output_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, impl->skip_rstr_out, impl->e_data_out_indices, impl->apply_add_basis_out, block_size,
+                                                  impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields,
+                                                  num_output_fields, Q));
 
   // Identity QFunctions
   if (impl->is_identity_qf) {
@@ -200,6 +263,7 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {
   }
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -210,13 +274,15 @@ static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, Ceed
                                                   CeedVector in_vec, bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX],
                                                   CeedOperator_Blocked *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool         is_active;
     uint64_t     state;
     CeedEvalMode eval_mode;
     CeedVector   vec;
 
     // Get input vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) {
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
       if (skip_active) continue;
       else vec = in_vec;
     }
@@ -226,13 +292,14 @@ static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, Ceed
     } else {
       // Restrict
       CeedCallBackend(CeedVectorGetState(vec, &state));
-      if (state != impl->input_states[i] || vec == in_vec) {
+      if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
         CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request));
-        impl->input_states[i] = state;
       }
+      impl->input_states[i] = state;
       // Get evec
       CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i]));
     }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -251,15 +318,19 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunc
 
     // Skip active input
     if (skip_active) {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) continue;
     }
 
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Basis action
@@ -275,6 +346,7 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunc
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
         CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][(CeedSize)e * elem_size * num_comp]));
         CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:
         break;  // No action
@@ -287,8 +359,8 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunc
 // Output Basis Action
 //------------------------------------------------------------------------------
 static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields,
-                                                  CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op,
-                                                  CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) {
+                                                  CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, bool *apply_add_basis,
+                                                  CeedOperator op, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedInt             elem_size, num_comp;
     CeedEvalMode        eval_mode;
@@ -298,6 +370,7 @@ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFun
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     // Basis action
     switch (eval_mode) {
@@ -311,7 +384,12 @@ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFun
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
         CeedCallBackend(CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER,
                                            &e_data_full[i + num_input_fields][(CeedSize)e * elem_size * num_comp]));
-        CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        if (apply_add_basis[i]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -333,10 +411,13 @@ static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields, Ce
 
     // Skip active inputs
     if (skip_active) {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) continue;
     }
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
@@ -360,30 +441,34 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
   CeedOperatorField    *op_input_fields, *op_output_fields;
   CeedOperator_Blocked *impl;
 
-  CeedCallBackend(CeedOperatorGetData(op, &impl));
-  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size);
-
   // Setup
   CeedCallBackend(CeedOperatorSetup_Blocked(op));
 
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+
   // Restriction only operator
   if (impl->is_identity_rstr_op) {
     CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[0], CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request));
     CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[1], CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request));
     return CEED_ERROR_SUCCESS;
   }
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size);
 
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data_full, impl, request));
 
   // Output Evecs
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields]));
+  for (CeedInt i = num_output_fields - 1; i >= 0; i--) {
+    if (impl->skip_rstr_out[i]) {
+      e_data_full[i + num_input_fields] = e_data_full[impl->e_data_out_indices[i] + num_input_fields];
+    } else {
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields]));
+    }
   }
 
   // Loop through elements
@@ -393,8 +478,8 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_NONE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
-        CeedCallBackend(
-            CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][(CeedSize)e * Q * size]));
+        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER,
+                                           &e_data_full[i + num_input_fields][(CeedSize)e * Q * size]));
       }
     }
 
@@ -407,27 +492,32 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
     }
 
     // Output basis apply
-    CeedCallBackend(CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields, op,
-                                                    e_data_full, impl));
+    CeedCallBackend(CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields,
+                                                    impl->apply_add_basis_out, op, e_data_full, impl));
   }
 
   // Output restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool       is_active;
     CeedVector vec;
 
+    if (impl->skip_rstr_out[i]) continue;
     // Restore evec
     CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields]));
     // Get output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
     // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+    if (is_active) vec = out_vec;
     // Restrict
-    CeedCallBackend(
-        CeedElemRestrictionApply(impl->block_rstr[i + impl->num_inputs], CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request));
+    CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i + impl->num_inputs], CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec,
+                                             request));
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Restore input arrays
   CeedCallBackend(CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, false, e_data_full, impl));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -437,8 +527,7 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed
 static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator op, bool build_objects, CeedVector *assembled,
                                                                   CeedElemRestriction *rstr, CeedRequest *request) {
   Ceed                  ceed;
-  CeedSize              q_size;
-  CeedInt               Q, num_input_fields, num_output_fields, num_elem, size;
+  CeedInt               qf_size_in, qf_size_out, Q, num_input_fields, num_output_fields, num_elem;
   const CeedInt         block_size = 8;
   CeedScalar           *l_vec_array;
   CeedScalar           *e_data_full[2 * CEED_FIELD_MAX] = {0};
@@ -448,8 +537,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
   CeedOperator_Blocked *impl;
 
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  CeedInt             num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
-  CeedVector         *active_in  = impl->qf_active_in;
+  qf_size_in                     = impl->qf_size_in;
+  qf_size_out                    = impl->qf_size_out;
   CeedVector          l_vec      = impl->qf_l_vec;
   CeedElemRestriction block_rstr = impl->qf_block_rstr;
 
@@ -471,55 +560,45 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
   CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request));
 
   // Count number of active input fields
-  if (!num_active_in) {
+  if (qf_size_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      CeedScalar *q_vec_array;
-      CeedVector  vec;
+      CeedInt    field_size;
+      CeedVector vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array));
-        CeedCallBackend(CeedRealloc(num_active_in + size, &active_in));
-        for (CeedInt field = 0; field < size; field++) {
-          q_size = (CeedSize)Q * block_size;
-          CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field]));
-          CeedCallBackend(
-              CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q * block_size]));
-        }
-        num_active_in += size;
-        CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
+        qf_size_in += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
-    impl->num_active_in = num_active_in;
-    impl->qf_active_in  = active_in;
+    CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_in = qf_size_in;
   }
 
   // Count number of active output fields
-  if (!num_active_out) {
+  if (qf_size_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedInt    field_size;
       CeedVector vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
-        num_active_out += size;
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+        qf_size_out += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
-    impl->num_active_out = num_active_out;
+    CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_out = qf_size_out;
   }
 
-  // Check sizes
-  CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-
   // Setup Lvec
   if (!l_vec) {
-    const CeedSize l_size = (CeedSize)num_blocks * block_size * Q * num_active_in * num_active_out;
+    const CeedSize l_size = (CeedSize)num_blocks * block_size * Q * qf_size_in * qf_size_out;
 
     CeedCallBackend(CeedVectorCreate(ceed, l_size, &l_vec));
     impl->qf_l_vec = l_vec;
@@ -528,21 +607,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
 
   // Setup block restriction
   if (!block_rstr) {
-    const CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q};
+    const CeedInt strides[3] = {1, Q, qf_size_in * qf_size_out * Q};
 
-    CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, num_active_in * num_active_out,
-                                                            num_active_in * num_active_out * num_elem * Q, strides, &block_rstr));
+    CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, qf_size_in * qf_size_out,
+                                                            qf_size_in * qf_size_out * num_elem * Q, strides, &block_rstr));
     impl->qf_block_rstr = block_rstr;
   }
 
   // Build objects if needed
   if (build_objects) {
-    const CeedSize l_size     = (CeedSize)num_elem * Q * num_active_in * num_active_out;
-    const CeedInt  strides[3] = {1, Q, num_active_in * num_active_out * Q};
+    const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
+    const CeedInt  strides[3] = {1, Q, qf_size_in * qf_size_out * Q};
 
     // Create output restriction
-    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q,
-                                                     strides, rstr));
+    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out,
+                                                     (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
   }
@@ -553,37 +632,64 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
     CeedCallBackend(CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, true, e_data_full, impl));
 
     // Assemble QFunction
-    for (CeedInt in = 0; in < num_active_in; in++) {
-      // Set Inputs
-      CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0));
-      if (num_active_in > 1) {
-        CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0));
-      }
-      if (!impl->is_identity_qf) {
-        // Set Outputs
-        for (CeedInt out = 0; out < num_output_fields; out++) {
-          CeedVector vec;
-
-          // Get output vector
-          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-          // Check if active output
-          if (vec == CEED_VECTOR_ACTIVE) {
-            CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array));
-            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
-            l_vec_array += size * Q * block_size;  // Advance the pointer by the size of the output
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool       is_active;
+      CeedInt    field_size;
+      CeedVector vec;
+
+      // Check if active input
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active) continue;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+      for (CeedInt field = 0; field < field_size; field++) {
+        // Set current portion of input to 1.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 1.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
+        }
+
+        if (!impl->is_identity_qf) {
+          // Set Outputs
+          for (CeedInt out = 0; out < num_output_fields; out++) {
+            CeedInt    field_size;
+            CeedVector vec;
+
+            // Get output vector
+            CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
+            // Check if active output
+            if (vec == CEED_VECTOR_ACTIVE) {
+              CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array));
+              CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
+              l_vec_array += field_size * Q * block_size;  // Advance the pointer by the size of the output
+            }
+            CeedCallBackend(CeedVectorDestroy(&vec));
           }
+          // Apply QFunction
+          CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out));
+        } else {
+          CeedInt           field_size;
+          const CeedScalar *array;
+
+          // Copy Identity Outputs
+          CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size));
+          CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < field_size * Q * block_size; j++) l_vec_array[j] = array[j];
+          CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array));
+          l_vec_array += field_size * Q * block_size;
+        }
+        // Reset input to 0.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
         }
-        // Apply QFunction
-        CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out));
-      } else {
-        const CeedScalar *q_vec_array;
-
-        // Copy Identity Outputs
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &size));
-        CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array));
-        for (CeedInt i = 0; i < size * Q * block_size; i++) l_vec_array[i] = q_vec_array[i];
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array));
-        l_vec_array += size * Q * block_size;
       }
     }
   }
@@ -593,12 +699,12 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
     for (CeedInt out = 0; out < num_output_fields; out++) {
       CeedVector vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -609,6 +715,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o
   CeedCallBackend(CeedVectorRestoreArray(l_vec, &l_vec_array));
   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
   CeedCallBackend(CeedElemRestrictionApply(block_rstr, CEED_TRANSPOSE, l_vec, *assembled, request));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -634,6 +742,10 @@ static int CeedOperatorDestroy_Blocked(CeedOperator op) {
 
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
+  CeedCallBackend(CeedFree(&impl->skip_rstr_in));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_out));
+  CeedCallBackend(CeedFree(&impl->e_data_out_indices));
+  CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
     CeedCallBackend(CeedElemRestrictionDestroy(&impl->block_rstr[i]));
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i]));
@@ -657,10 +769,6 @@ static int CeedOperatorDestroy_Blocked(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
 
   // QFunction assembly data
-  for (CeedInt i = 0; i < impl->num_active_in; i++) {
-    CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
-  }
-  CeedCallBackend(CeedFree(&impl->qf_active_in));
   CeedCallBackend(CeedVectorDestroy(&impl->qf_l_vec));
   CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_block_rstr));
 
@@ -682,6 +790,7 @@ int CeedOperatorCreate_Blocked(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Blocked));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Blocked));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Blocked));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/blocked/ceed-blocked.c b/backends/blocked/ceed-blocked.c
index f37338f0d6..f50d2fc91e 100644
--- a/backends/blocked/ceed-blocked.c
+++ b/backends/blocked/ceed-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -25,6 +25,7 @@ static int CeedInit_Blocked(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Blocked));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/blocked/ceed-blocked.h b/backends/blocked/ceed-blocked.h
index 917f4eb604..e1976d6e43 100644
--- a/backends/blocked/ceed-blocked.h
+++ b/backends/blocked/ceed-blocked.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -17,16 +17,17 @@ typedef struct {
 
 typedef struct {
   bool                 is_identity_qf, is_identity_rstr_op;
-  CeedElemRestriction *block_rstr;   /* Blocked versions of restrictions */
-  CeedVector          *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
+  bool                *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
+  CeedInt             *e_data_out_indices;
   uint64_t            *input_states; /* State counter of inputs */
+  CeedVector          *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
   CeedVector          *e_vecs_in;    /* Element block input E-vectors  */
   CeedVector          *e_vecs_out;   /* Element block output E-vectors */
   CeedVector          *q_vecs_in;    /* Element block input Q-vectors  */
   CeedVector          *q_vecs_out;   /* Element block output Q-vectors */
+  CeedElemRestriction *block_rstr;   /* Blocked versions of restrictions */
   CeedInt              num_inputs, num_outputs;
-  CeedInt              num_active_in, num_active_out;
-  CeedVector          *qf_active_in;
+  CeedInt              qf_size_in, qf_size_out;
   CeedVector           qf_l_vec;
   CeedElemRestriction  qf_block_rstr;
 } CeedOperator_Blocked;
diff --git a/backends/ceed-backend-list-avx.h b/backends/ceed-backend-list-avx.h
new file mode 100644
index 0000000000..5e19a016c7
--- /dev/null
+++ b/backends/ceed-backend-list-avx.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Avx_Blocked, 1, "/cpu/self/avx/blocked")
+CEED_BACKEND(CeedRegister_Avx_Serial, 1, "/cpu/self/avx/serial")
diff --git a/backends/ceed-backend-list-cuda.h b/backends/ceed-backend-list-cuda.h
new file mode 100644
index 0000000000..87593f5b08
--- /dev/null
+++ b/backends/ceed-backend-list-cuda.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Cuda, 1, "/gpu/cuda/ref")
+CEED_BACKEND(CeedRegister_Cuda_Gen, 1, "/gpu/cuda/gen")
+CEED_BACKEND(CeedRegister_Cuda_Shared, 1, "/gpu/cuda/shared")
diff --git a/backends/ceed-backend-list-hip.h b/backends/ceed-backend-list-hip.h
new file mode 100644
index 0000000000..e66fc98298
--- /dev/null
+++ b/backends/ceed-backend-list-hip.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Hip, 1, "/gpu/hip/ref")
+CEED_BACKEND(CeedRegister_Hip_Gen, 1, "/gpu/hip/gen")
+CEED_BACKEND(CeedRegister_Hip_Shared, 1, "/gpu/hip/shared")
diff --git a/backends/ceed-backend-list-magma.h b/backends/ceed-backend-list-magma.h
new file mode 100644
index 0000000000..66c985c884
--- /dev/null
+++ b/backends/ceed-backend-list-magma.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma")
+CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det")
diff --git a/backends/ceed-backend-list-memcheck.h b/backends/ceed-backend-list-memcheck.h
new file mode 100644
index 0000000000..fa6f51b0bb
--- /dev/null
+++ b/backends/ceed-backend-list-memcheck.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked")
+CEED_BACKEND(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial")
diff --git a/backends/ceed-backend-list-ref.h b/backends/ceed-backend-list-ref.h
new file mode 100644
index 0000000000..ac3e21525d
--- /dev/null
+++ b/backends/ceed-backend-list-ref.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
+CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
+CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
+CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
diff --git a/backends/ceed-backend-list-sycl.h b/backends/ceed-backend-list-sycl.h
new file mode 100644
index 0000000000..88617e1b2b
--- /dev/null
+++ b/backends/ceed-backend-list-sycl.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Sycl, 1, "/gpu/sycl/ref")
+CEED_BACKEND(CeedRegister_Sycl_Shared, 1, "/gpu/sycl/shared")
+CEED_BACKEND(CeedRegister_Sycl_Gen, 1, "/gpu/sycl/gen")
diff --git a/backends/ceed-backend-list-xsmm.h b/backends/ceed-backend-list-xsmm.h
new file mode 100644
index 0000000000..fee5f81102
--- /dev/null
+++ b/backends/ceed-backend-list-xsmm.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// This header does not have guards because it is included multiple times.
+// This will be expanded inside CeedRegisterAll() to call each registration function.
+// This is also used to create weakly linked registration functions in `backends/weak/ceed-*-weak.c'.
+
+CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
+CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h
index 75b1d1fe75..77f8e34490 100644
--- a/backends/ceed-backend-list.h
+++ b/backends/ceed-backend-list.h
@@ -1,35 +1,29 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-// This header does not have guards because it is included multiple times.
+// This header does not have guards because it may be included multiple times.
 
-// List each backend registration function once here.
-// This will be expanded inside CeedRegisterAll() to call each registration function in the order listed, and also to define weak symbol aliases for
-// backends that are not configured.
+// List each backend registration function in the corresponding `ceed-backend-list-*.h` file, grouped by install requirement.
+// Include each of those files here.
+// This will be expanded inside CeedRegisterAll() to call each registration function in the order listed.
 
-CEED_BACKEND(CeedRegister_Avx_Blocked, 1, "/cpu/self/avx/blocked")
-CEED_BACKEND(CeedRegister_Avx_Serial, 1, "/cpu/self/avx/serial")
-CEED_BACKEND(CeedRegister_Cuda, 1, "/gpu/cuda/ref")
-CEED_BACKEND(CeedRegister_Cuda_Gen, 1, "/gpu/cuda/gen")
-CEED_BACKEND(CeedRegister_Cuda_Shared, 1, "/gpu/cuda/shared")
-CEED_BACKEND(CeedRegister_Hip, 1, "/gpu/hip/ref")
-CEED_BACKEND(CeedRegister_Hip_Gen, 1, "/gpu/hip/gen")
-CEED_BACKEND(CeedRegister_Hip_Shared, 1, "/gpu/hip/shared")
-CEED_BACKEND(CeedRegister_Sycl, 1, "/gpu/sycl/ref")
-CEED_BACKEND(CeedRegister_Sycl_Shared, 1, "/gpu/sycl/shared")
-CEED_BACKEND(CeedRegister_Sycl_Gen, 1, "/gpu/sycl/gen")
-CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma")
-CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det")
-CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked")
-CEED_BACKEND(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial")
-CEED_BACKEND(CeedRegister_Occa, 6, "/cpu/self/occa", "/cpu/openmp/occa", "/gpu/dpcpp/occa", "/gpu/opencl/occa", "/gpu/hip/occa", "/gpu/cuda/occa")
-CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
-CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
-CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
-CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
-CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
-CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
+// Always compiled
+#include "ceed-backend-list-ref.h"
+// Requires AVX support
+#include "ceed-backend-list-avx.h"
+// Requires Valgrind
+#include "ceed-backend-list-memcheck.h"
+// Requires LIBXSMM
+#include "ceed-backend-list-xsmm.h"
+// Requires CUDA
+#include "ceed-backend-list-cuda.h"
+// Requires ROCm
+#include "ceed-backend-list-hip.h"
+// Requires SYCL
+#include "ceed-backend-list-sycl.h"
+// Requires MAGMA + (CUDA or ROCm)
+#include "ceed-backend-list-magma.h"
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
index 8b2a8dfee5..7a36364d97 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -9,6 +9,7 @@
 
 #include <ceed.h>
 #include <ceed/backend.h>
+#include <ceed/gen-tools.h>
 #include <ceed/jit-tools.h>
 #include <cuda_runtime.h>
 
@@ -22,360 +23,466 @@
 #include "../cuda/ceed-cuda-compile.h"
 #include "ceed-cuda-gen.h"
 
+struct FieldReuse_Cuda {
+  CeedInt      index;
+  bool         is_input;
+  CeedEvalMode eval_mode;
+};
+
 //------------------------------------------------------------------------------
-// Build single operator kernel
+// Determine type of operator
 //------------------------------------------------------------------------------
-extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
-  using std::ostringstream;
-  using std::string;
-
-  bool                      is_setup_done, is_identity_qf;
-  struct cudaDeviceProp     prop;
-  Ceed                      ceed;
-  Ceed_Cuda                *ceed_data;
-  CeedSize                  l_size;
-  CeedInt                   Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1;
-  CeedEvalMode              eval_mode;
-  CeedElemRestriction       elem_rstr;
-  CeedElemRestriction_Cuda *rstr_data;
-  CeedBasis                 basis;
-  CeedBasis_Cuda_shared    *basis_data;
-  CeedQFunctionField       *qf_input_fields, *qf_output_fields;
-  CeedQFunction_Cuda_gen   *qf_data;
-  CeedQFunction             qf;
-  CeedOperatorField        *op_input_fields, *op_output_fields;
-  CeedOperator_Cuda_gen    *data;
-
-  CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
-  if (is_setup_done) return CEED_ERROR_SUCCESS;
-
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedOperatorGetData(op, &data));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-  Q_1d = Q;
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-
-  // TODO: put in a function?
-  // Check for restriction only identity operator
-  CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
-  if (is_identity_qf) {
-    CeedEvalMode eval_mode_in, eval_mode_out;
-
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
-    CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
-              "Backend does not implement restriction only identity operators");
-  }
-
-  ostringstream code;
+static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
+                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
+                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
+                                                CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
+  // Check if all are tensor
+  *is_all_tensor = true;
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedBasis basis;
 
-  // TODO: put in a function?
-  // Add atomicAdd function for old NVidia architectures
-  CeedCallBackend(CeedGetData(ceed, &ceed_data));
-  CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
-  if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
-    char       *atomic_add_source;
-    const char *atomic_add_path;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+    if (basis != CEED_BASIS_NONE) {
+      bool is_field_tensor;
 
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Atomic Add Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source));
-    code << atomic_add_source;
-    CeedCallBackend(CeedFree(&atomic_add_path));
-    CeedCallBackend(CeedFree(&atomic_add_source));
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      *is_all_tensor = *is_all_tensor && is_field_tensor;
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedBasis basis;
 
-  // Load basis source files
-  // TODO: generalize to accept different device functions?
-  {
-    char       *tensor_basis_kernel_source;
-    const char *tensor_basis_kernel_path;
-
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source));
-    code << tensor_basis_kernel_source;
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_source));
-  }
-  {
-    char       *cuda_gen_template_source;
-    const char *cuda_gen_template_path;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+    if (basis != CEED_BASIS_NONE) {
+      bool is_field_tensor;
 
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-gen-templates.h", &cuda_gen_template_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Cuda-Gen Template Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source));
-    code << cuda_gen_template_source;
-    CeedCallBackend(CeedFree(&cuda_gen_template_path));
-    CeedCallBackend(CeedFree(&cuda_gen_template_source));
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      *is_all_tensor = *is_all_tensor && is_field_tensor;
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
 
-  // Get QFunction source and name
-  string qfunction_source(qf_data->qfunction_source);
-  string qfunction_name(qf_data->qfunction_name);
-  string operator_name;
-  operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
+  // Find max_P, max_P_1d, Q, and Q_1d
+  bool is_all_3d = true;
 
-  // Find dim, P_1d, Q_1d
-  data->max_P_1d = 0;
+  *max_P    = 0;
+  *max_P_1d = 0;
+  *Q        = 0;
+  *Q_1d     = 0;
   for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedBasis basis;
+
     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
-      bool is_tensor;
+      bool    is_field_tensor;
+      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
 
-      CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      // Check if 3D
+      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
+      is_all_3d = is_all_3d && (field_dim == 3);
+      *max_dim  = CeedIntMax(*max_dim, field_dim);
 
-      // Collect dim, P_1d, and Q_1d
-      CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-      CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-      data->max_P_1d = CeedIntMax(data->max_P_1d, P_1d);
+      // Collect P, P_1d, Q, and Q_1d
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
+      *max_P = CeedIntMax(*max_P, field_P);
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      }
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
+      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q = field_Q;
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+        *Q_1d = field_Q_1d;
+      }
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
-  // Check output bases for Q_1d, dim as well
-  //   The only input basis might be CEED_BASIS_NONE
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedBasis basis;
+
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
-      bool is_tensor;
+      bool    is_field_tensor;
+      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
 
-      CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Check if 3D
+      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
+      is_all_3d = is_all_3d && (field_dim == 3);
+      *max_dim  = CeedIntMax(*max_dim, field_dim);
 
-      // Collect Q_1d
-      CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-      CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      // Collect P, P_1d, Q, and Q_1d
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
+      *max_P = CeedIntMax(*max_P, field_P);
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      }
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
+      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q = field_Q;
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+        *Q_1d = field_Q_1d;
+      }
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
-  data->dim  = dim;
-  data->Q_1d = Q_1d;
 
   // Only use 3D collocated gradient parallelization strategy when gradient is computed
-  // TODO: put in a function?
-  bool use_collograd_parallelization = false;
-
-  if (dim == 3) {
+  *use_3d_slices = false;
+  if (is_all_3d && *is_all_tensor) {
     bool was_grad_found = false;
 
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedEvalMode eval_mode;
+
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_GRAD) {
+        CeedBasis_Cuda_shared *basis_data;
+        CeedBasis              basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
-        was_grad_found                = true;
+        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
+        was_grad_found = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedEvalMode eval_mode;
+
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_GRAD) {
+        CeedBasis_Cuda_shared *basis_data;
+        CeedBasis              basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
-        was_grad_found                = true;
+        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
+        was_grad_found = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
   }
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Define CEED_Q_VLA
-  code << "\n#undef CEED_Q_VLA\n";
-  if (dim != 3 || use_collograd_parallelization) {
-    code << "#define CEED_Q_VLA 1\n\n";
-  } else {
-    code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
-  }
+//------------------------------------------------------------------------------
+// Setup fields
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
+                                                     CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse,
+                                                     CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
+                                                     bool use_3d_slices, bool skip_active_load) {
+  bool      is_tensor = true, is_active = true;
+  CeedBasis basis;
 
-  code << qfunction_source;
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+  {
+    CeedVector vec;
 
-  // Setup
-  code << "\n// -----------------------------------------------------------------------------\n";
-  code << "\nextern \"C\" __global__ void " << operator_name
-       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar* W) {\n";
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
-      code << "  const CeedScalar* d_u_" << i << " = fields.inputs[" << i << "];\n";
-    }
+    CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  CeedScalar* d_v_" << i << " = fields.outputs[" << i << "];\n";
-  }
+  const char            *field_name;
+  std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
+  std::string            option_name = (is_input ? "inputs" : "outputs");
+  CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
+  CeedInt                elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
+  CeedElemRestriction    elem_rstr;
+  CeedBasis_Cuda_shared *basis_data;
 
-  code << "  const CeedInt dim = " << dim << ";\n";
-  code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+  // Field reuse info
+  bool use_previous_field = field_reuse.index != -1;
 
-  code << "  extern __shared__ CeedScalar slice[];\n";
-  // TODO put in a function? InitSharedData_Cuda?
-  code << "  SharedData_Cuda data;\n";
-  code << "  data.t_id_x = threadIdx.x;\n";
-  code << "  data.t_id_y = threadIdx.y;\n";
-  code << "  data.t_id_z = threadIdx.z;\n";
-  code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice+data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
+  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
+  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
 
-  code << "\n  // -- Input field constants and basis data --\n";
-  // TODO: Put in a function?
-  // Initialize constants, and matrices B and G
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "  // ---- Input field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+  }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+  if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
+    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
+  }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
-    // Set field constants
-    if (eval_mode != CEED_EVAL_WEIGHT) {
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-      if (basis != CEED_BASIS_NONE) {
-        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-        code << "  const CeedInt P_in_" << i << " = " << P_1d << ";\n";
+  // Set field constants
+  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
+  if (is_tensor && !is_all_tensor) {
+    CeedInt P = 0;
+
+    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
+  }
+  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
+  if (eval_mode != CEED_EVAL_WEIGHT) {
+    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
+  }
+
+  // Load basis data
+  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  switch (eval_mode) {
+    case CEED_EVAL_NONE:
+      break;
+    case CEED_EVAL_INTERP:
+      if (is_at_points) {
+        // AtPoints
+        if (!basis_data->d_chebyshev_interp_1d) {
+          CeedSize    interp_bytes;
+          CeedScalar *chebyshev_interp_1d;
+
+          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+          CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
+          CeedCallCuda(CeedBasisReturnCeed(basis),
+                       cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
+          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+        }
+        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
+        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
       } else {
-        code << "  const CeedInt P_in_" << i << " = " << Q_1d << ";\n";
+        // Standard quadrature
+        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+        else data->B.outputs[i] = basis_data->d_interp_1d;
       }
-      code << "  const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n";
-    }
+      if (use_previous_field && !skip_active_load) {
+        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-    // Load basis data
-    code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        break;
-      case CEED_EVAL_INTERP:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.inputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_in_" << i << ",Q_1d>(data, B.inputs[" << i << "], s_B_in_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.inputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_in_" << i << ",Q_1d>(data, B.inputs[" << i << "], s_B_in_" << i << ");\n";
-        if (use_collograd_parallelization) {
-          data->G.inputs[i] = basis_data->d_collo_grad_1d;
-          code << "  __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n";
-          code << "  loadMatrix<Q_1d,Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i << ");\n";
+        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
+      } else {
+        bool is_collocated = false;
+
+        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+        if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
+          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
         } else {
-          bool has_collo_grad = basis_data->d_collo_grad_1d;
-          data->G.inputs[i]   = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
-          code << "  __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
-          code << "  loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_in_" + std::to_string(i))) << ",Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i
-               << ");\n";
+          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
         }
-        break;
-      case CEED_EVAL_WEIGHT:
-        break;  // No action
-      case CEED_EVAL_DIV:
-        break;  // TODO: Not implemented
-      case CEED_EVAL_CURL:
-        break;  // TODO: Not implemented
-    }
-  }
-
-  code << "\n  // -- Output field constants and basis data --\n";
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  // ---- Output field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      }
+      break;
+    case CEED_EVAL_GRAD:
+      if (is_at_points) {
+        // AtPoints
+        if (!basis_data->d_chebyshev_interp_1d) {
+          CeedSize    interp_bytes;
+          CeedScalar *chebyshev_interp_1d;
 
-    // Set field constants
-    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-    if (basis != CEED_BASIS_NONE) {
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-      code << "  const CeedInt P_out_" << i << " = " << P_1d << ";\n";
-    } else {
-      code << "  const CeedInt P_out_" << i << " = " << Q_1d << ";\n";
-    }
-    code << "  const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n";
+          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+          CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
+          CeedCallCuda(CeedBasisReturnCeed(basis),
+                       cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
+          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+        }
+        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
+        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
+      } else {
+        // Standard quadrature
+        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+        else data->B.outputs[i] = basis_data->d_interp_1d;
+      }
+      if (is_tensor) {
+        if (use_previous_field && !skip_active_load) {
+          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-    // Load basis data
-    code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        break;  // No action
-      case CEED_EVAL_INTERP:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.outputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_out_" << i << ",Q_1d>(data, B.outputs[" << i << "], s_B_out_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.outputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_out_" << i << ",Q_1d>(data, B.outputs[" << i << "], s_B_out_" << i << ");\n";
-        if (use_collograd_parallelization) {
-          data->G.outputs[i] = basis_data->d_collo_grad_1d;
-          code << "  __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n";
-          code << "  loadMatrix<Q_1d,Q_1d>(data, G.outputs[" << i << "], s_G_out_" << i << ");\n";
+          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
         } else {
-          bool has_collo_grad = basis_data->d_collo_grad_1d;
-          data->G.outputs[i]  = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
-          code << "  __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
-          code << "  loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_out_" + std::to_string(i))) << ",Q_1d>(data, G.outputs[" << i << "], s_G_out_"
-               << i << ");\n";
+          bool is_collocated = false;
+
+          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+          if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
+            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
+          } else {
+            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+          }
         }
-        break;
-      // LCOV_EXCL_START
-      case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-        break;  // Should not occur
       }
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
-        break;  // Should not occur
+      if (is_at_points) break;  // No G mat for AtPoints
+      if (use_3d_slices) {
+        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
+        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
+        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
+          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
+
+          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+        } else if (is_active && skip_active_load) {
+          code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
+        } else {
+          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+        }
+      } else {
+        bool has_collo_grad = basis_data->d_collo_grad_1d;
+
+        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
+        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
+        if (has_collo_grad) {
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
+            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
+
+            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else if (is_active && skip_active_load) {
+            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
+          } else {
+            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          }
+        } else {
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
+            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
+
+            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else if (is_active && skip_active_load) {
+            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
+          } else {
+            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
+                 << (is_tensor ? "" : var_suffix) << "];\n";
+            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
+                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          }
+        }
       }
-        // LCOV_EXCL_STOP
-    }
+      break;
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
+      // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      break;  // TODO: Not implemented
+              // LCOV_EXCL_STOP
   }
-  code << "\n  // -- Element loop --\n";
-  code << "  __syncthreads();\n";
-  code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
-  // Input basis apply if needed
-  // Generate the correct eval mode code for each input
-  code << "    // -- Input field restrictions and basis actions --\n";
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "    // ---- Input field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+  CeedCallBackend(CeedBasisDestroy(&basis));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Restriction
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
+                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
+                                                       CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
+                                                       bool use_3d_slices) {
+  std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string               P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
+  CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
+  CeedInt                   elem_size = 0, num_comp = 0;
+  CeedSize                  l_size;
+  CeedRestrictionType       rstr_type = CEED_RESTRICTION_STANDARD;
+  CeedElemRestriction_Cuda *rstr_data;
+  CeedElemRestriction       elem_rstr;
+
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
+    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
+  }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
+
+  // Restriction
+  if (is_input) {
+    // Input
+    if (field_input_buffer[i] != i) {
+      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
+
+      // Restriction was already done for previous input
+      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
+    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
+      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
+        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
+        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
+        // Otherwise we're using the scratch space
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+      }
+      switch (rstr_type) {
+        case CEED_RESTRICTION_STANDARD: {
+          CeedInt comp_stride;
+
+          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
+               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
+               << ");\n";
+          break;
+        }
+        case CEED_RESTRICTION_STRIDED: {
+          bool    has_backend_strides;
+          CeedInt num_elem;
 
-    // TODO: put in a function?
-    // Restriction
-    if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) {
-      code << "    CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n";
+          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
+          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
+          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
 
-      bool is_strided;
+          if (!has_backend_strides) {
+            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+          }
+          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
+               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
+               << var_suffix << ");\n";
+          break;
+        }
+        case CEED_RESTRICTION_POINTS: {
+          CeedInt comp_stride;
 
-      CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-      if (!is_strided) {
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+          break;
+        }
+        // LCOV_EXCL_START
+        case CEED_RESTRICTION_ORIENTED:
+        case CEED_RESTRICTION_CURL_ORIENTED:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+  } else {
+    // Output
+    switch (rstr_type) {
+      case CEED_RESTRICTION_STANDARD: {
         CeedInt comp_stride;
 
         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-        code << "    const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
+        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-        code << "    // CompStride: " << comp_stride << "\n";
-        CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
-        data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    readDofsOffset" << dim << "d<num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ">(data, l_size_in_" << i
-             << ", elem, indices.inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n";
-      } else {
+        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
+        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
+             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
+             << ");\n";
+        break;
+      }
+      case CEED_RESTRICTION_STRIDED: {
         bool    has_backend_strides;
         CeedInt num_elem;
 
@@ -386,328 +493,2202 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
         if (!has_backend_strides) {
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
-        code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    readDofsStrided" << dim << "d<num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
-             << ">(data, elem, d_u_" << i << ", r_u_" << i << ");\n";
+        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
+             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
+             << ");\n";
+        break;
       }
+      case CEED_RESTRICTION_POINTS:
+        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
+        break;
+      // LCOV_EXCL_START
+      case CEED_RESTRICTION_ORIENTED:
+      case CEED_RESTRICTION_CURL_ORIENTED:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
     }
+  }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Basis
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
+                                                 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d,
+                                                 bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
+  bool      is_tensor = true, is_collocated = true;
+  CeedBasis basis;
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+  CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+
+  std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
+  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
+  CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedElemRestriction elem_rstr;
+
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+  }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+  if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
+  }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
-    // TODO: put in a function?
-    // Basis action
-    code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  // Basis
+  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  if (is_input) {
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        if (!use_collograd_parallelization) {
-          code << "    CeedScalar* r_t_" << i << " = r_u_" << i << ";\n";
+        if (!use_3d_slices && !is_at_points) {
+          code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
         }
         break;
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n";
-        code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_in_" << i << ",P_in_" << i << ",Q_1d>(data, r_u_" << i << ", s_B_in_"
-             << i << ", r_t_" << i << ");\n";
+        if (is_at_points) {
+          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
+
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+        } else {
+          std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                                                   std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                                                : "InterpNonTensor";
+          std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
+
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+               << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+        }
         break;
       case CEED_EVAL_GRAD:
-        if (use_collograd_parallelization) {
-          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_in_" << i << ",P_in_" << i << ",Q_1d>(data, r_u_" << i
-               << ", s_B_in_" << i << ", r_t_" << i << ");\n";
+        if (is_at_points) {
+          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
+
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+        } else if (use_3d_slices) {
+          std::string function_name =
+              (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";
+
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+        } else if (is_tensor) {
+          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
+          std::string function_name =
+              (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
+              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
+          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
+
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+               << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
-          CeedInt P_1d;
+          std::string function_name = "GradNonTensor";
 
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*dim*Q_1d];\n";
-          code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp_in_" << i
-               << ",P_in_" << i << ",Q_1d>(data, r_u_" << i << ", s_B_in_" << i << ", s_G_in_" << i << ", r_t_" << i << ");\n";
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+               << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
-      case CEED_EVAL_WEIGHT:
-        code << "    CeedScalar r_t_" << i << "[Q_1d];\n";
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->W = basis_data->d_q_weight_1d;
-        code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<Q_1d>(data, W, r_t_" << i << ");\n";
-        break;  // No action
+      case CEED_EVAL_WEIGHT: {
+        if (is_at_points) {
+          code << tab << "// Nothing to do AtPoints\n";
+        } else {
+          CeedBasis_Cuda_shared *basis_data;
+          std::string            function_name = is_tensor
+                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                                                     : "WeightNonTensor";
+
+          code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          CeedCallBackend(CeedBasisGetData(basis, &basis_data));
+          data->W = basis_data->d_q_weight_1d;
+          code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+        }
+        break;
+      }
+      // LCOV_EXCL_START
       case CEED_EVAL_DIV:
-        break;  // TODO: Not implemented
       case CEED_EVAL_CURL:
         break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
     }
-  }
+  } else {
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
+        break;  // No action
+      case CEED_EVAL_INTERP:
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+        if (is_at_points) {
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-  // TODO: put in a function + separate collograd logic
-  // Q function
-  code << "\n    // -- Output field setup --\n";
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "\n    // ---- Output field " << i << " ----\n";
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_GRAD) {
-      if (use_collograd_parallelization) {
-        // Accumulator for gradient slices
-        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n";
-        code << "    for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n";
-        code << "      for (CeedInt j = 0; j < Q_1d; ++j) {\n";
-        code << "        r_tt_" << i << "[j + i*Q_1d] = 0.0;\n";
-        code << "      }\n";
-        code << "    }\n";
-      } else {
-        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*dim*Q_1d];\n";
-      }
-    }
-    if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
-      code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n";
-    }
-  }
-  // We treat quadrature points per slice in 3d to save registers
-  if (use_collograd_parallelization) {
-    code << "\n    // Note: Using planes of 3D elements\n";
-    code << "#pragma unroll\n";
-    code << "    for (CeedInt q = 0; q < Q_1d; q++) {\n";
-    code << "      // -- Input fields --\n";
-    for (CeedInt i = 0; i < num_input_fields; i++) {
-      code << "      // ---- Input field " << i << " ----\n";
-      // Get elem_size, eval_mode, num_comp
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-      // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-      switch (eval_mode) {
-        case CEED_EVAL_NONE:
-          bool is_strided;
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else {
+          std::string function_name =
+              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                           std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                        : "InterpTransposeNonTensor";
+          std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+               << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        }
+        break;
+      case CEED_EVAL_GRAD:
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+        if (is_at_points) {
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-          if (!is_strided) {
-            CeedInt comp_stride;
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else if (use_3d_slices) {
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                                      std::to_string(dim) + "d";
 
-            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-            code << "      const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
-            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-            code << "      // CompStride: " << comp_stride << "\n";
-            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
-            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-            code << "      readSliceQuadsOffset"
-                 << "3d<num_comp_in_" << i << ", " << comp_stride << ", Q_1d>(data, l_size_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_"
-                 << i << ", r_q_" << i << ");\n";
-          } else {
-            bool    has_backend_strides;
-            CeedInt num_elem;
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else if (is_tensor) {
+          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
+          std::string function_name =
+              (dim == 1 ? "GradTranspose"
+                        : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
+              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
+          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
-            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
-            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else {
+          std::string function_name = "GradTransposeNonTensor";
 
-            if (!has_backend_strides) {
-              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
-            }
-            code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-            code << "      readSliceQuadsStrided"
-                 << "3d<num_comp_in_" << i
-                 << ",Q_1d"
-                    ","
-                 << strides[0] << "," << strides[1] << "," << strides[2] << ">(data, elem, q, d_u_" << i << ", r_q_" << i << ");\n";
-          }
-          break;
-        case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
-          code << "      for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n";
-          code << "        r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1d];\n";
-          code << "      }\n";
-          break;
-        case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "*dim];\n";
-          code << "      gradCollo3d<num_comp_in_" << i << ",Q_1d>(data, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ");\n";
-          break;
-        case CEED_EVAL_WEIGHT:
-          code << "      CeedScalar r_q_" << i << "[1];\n";
-          code << "      r_q_" << i << "[0] = r_t_" << i << "[q];\n";
-          break;  // No action
-        case CEED_EVAL_DIV:
-          break;  // TODO: Not implemented
-        case CEED_EVAL_CURL:
+          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+               << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+        }
+        break;
+      // LCOV_EXCL_START
+      case CEED_EVAL_WEIGHT:
+        break;  // Should not occur
+      case CEED_EVAL_DIV:
+      case CEED_EVAL_CURL:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
+    }
+  }
+  CeedCallBackend(CeedBasisDestroy(&basis));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// QFunction
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt max_dim,
+                                                     CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
+                                                     CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
+                                                     CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
+                                                     std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
+                                                     bool use_3d_slices, bool is_assemble) {
+  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
+  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
+  CeedElemRestriction elem_rstr;
+
+  // Setup output arrays
+  code << "\n";
+  code << tab << "// -- Output field setup\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    const char *field_name;
+    std::string var_suffix = "_out_" + std::to_string(i);
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        if (is_at_points) {
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
+        } else {
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+               << "];\n";
+        }
+        break;
+      case CEED_EVAL_INTERP:
+        if (is_at_points) {
+          // Accumulator for point data
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+               << "[i] = 0.0;\n";
+        } else {
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+               << "];\n";
+        }
+        break;
+      case CEED_EVAL_GRAD:
+        if (is_at_points) {
+          // Accumulator for point data
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+               << "[i] = 0.0;\n";
+        } else if (use_3d_slices) {
+          // Accumulator for gradient slices
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
+        } else {
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
+        }
+        break;
+      case CEED_EVAL_WEIGHT:
+        break;
+        // LCOV_EXCL_START
+      case CEED_EVAL_DIV:
+      case CEED_EVAL_CURL:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
+    }
+  }
+
+  if (is_at_points) {
+    // We need to handle batches of points
+    code << "\n";
+    code << tab << "// Note: Using batches of points\n";
+    code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
+    code << tab << "#pragma unroll\n";
+    code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
+    tab.push();
+    code << tab << "const CeedInt p = i % max_num_points;\n\n";
+
+    code << tab << "// -- Coordinates\n";
+    code << tab << "CeedScalar r_x[max_dim];\n";
+    code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
+
+    code << tab << "// -- Input fields\n";
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      const char *field_name;
+      std::string var_suffix = "_in_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      // Basis action
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_INTERP:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_WEIGHT:
+          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
+          code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
-    code << "\n      // -- Output fields --\n";
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
+      const char *field_name;
+      std::string var_suffix = "_out_" + std::to_string(i);
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
-          break;  // No action
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*dim];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           break;
+          // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+
+  } else if (use_3d_slices) {
+    // We treat quadrature points per slice in 3d to save registers
+    code << "\n";
+    code << tab << "// Note: Using planes of 3D elements\n";
+    code << tab << "#pragma unroll\n";
+    code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
+    tab.push();
+    code << tab << "// -- Input fields\n";
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      const char *field_name;
+      std::string var_suffix = "_in_" + std::to_string(i);
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      // Basis action
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          bool is_strided;
+
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
+          if (is_strided) {
+            bool    has_backend_strides;
+            CeedInt num_elem, elem_size;
+
+            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
+            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
+            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+
+            if (!has_backend_strides) {
+              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+            }
+            code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+                 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+            code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
+                 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
+          } else {
+            CeedSize                  l_size = 0;
+            CeedInt                   comp_stride;
+            CeedElemRestriction_Cuda *rstr_data;
+
+            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+            code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+            code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
+            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+            code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
+                 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
+          }
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+          break;
+        case CEED_EVAL_INTERP:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
+          tab.push();
+          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
+          tab.pop();
+          code << tab << "}\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
+               << var_suffix << ", r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_WEIGHT:
+          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
+          code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+    code << "\n";
+    code << tab << "// -- Output fields\n";
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
+      std::string var_suffix = "_out_" + std::to_string(i);
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Basis action
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          break;
+        case CEED_EVAL_INTERP:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_WEIGHT:
+          break;  // Should not occur
+        case CEED_EVAL_DIV:
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
   } else {
-    code << "\n      // Note: Using full elements\n";
-    code << "      // -- Input fields --\n";
+    code << "\n";
+    code << tab << "// Note: Using full elements\n";
+    code << tab << "{\n";
+    tab.push();
+    code << tab << "// -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      code << "      // ---- Input field " << i << " ----\n";
-      code << "      CeedScalar* r_q_" << i << " = r_t_" << i << ";\n";
+      const char *field_name;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
+      code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
     }
-    code << "      // -- Output fields --\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
-      code << "      CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n";
+      const char *field_name;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
     }
   }
-  code << "\n      // -- QFunction Inputs and outputs --\n";
-  code << "      CeedScalar* in[" << num_input_fields << "];\n";
+
+  // Input and output buffers
+  code << "\n";
+  code << tab << "// -- QFunction inputs and outputs\n";
+  code << tab << "// ---- Inputs\n";
+  code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "      // ---- Input field " << i << " ----\n";
-    code << "      in[" << i << "] = r_q_" << i << ";\n";
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+    code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
+    code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
   }
-  code << "      CeedScalar* out[" << num_output_fields << "];\n";
+  code << tab << "// ---- Outputs\n";
+  code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "      // ---- Output field " << i << " ----\n";
-    code << "      out[" << i << "] = r_qq_" << i << ";\n";
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
+    code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
   }
-  code << "\n      // -- Apply QFunction --\n";
-  code << "      " << qfunction_name << "(ctx, ";
-  if (dim != 3 || use_collograd_parallelization) {
+
+  // Apply QFunction
+  code << "\n";
+  code << tab << "// -- Apply QFunction\n";
+  code << tab << "" << qfunction_name << "(ctx, ";
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
     code << "1";
   } else {
-    code << "Q_1d";
+    code << Q_name;
   }
-  code << ", in, out);\n";
-  if (use_collograd_parallelization) {
-    code << "      // -- Output fields --\n";
+  code << ", inputs, outputs);\n";
+
+  if (is_at_points) {
+    // Map back to coefficients
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
+      const char *field_name;
+      std::string var_suffix = "_out_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
-        case CEED_EVAL_NONE:
-          code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
-          code << "        r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n";
-          code << "      }\n";
-          break;  // No action
+        case CEED_EVAL_NONE: {
+          CeedInt             comp_stride;
+          CeedElemRestriction elem_rstr;
+
+          if (is_assemble) break;
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
+               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
+          break;
+        }
         case CEED_EVAL_INTERP:
-          code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
-          code << "        r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n";
-          code << "      }\n";
+          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
+          tab.push();
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          tab.pop();
+          code << tab << "}\n";
+          code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      gradColloTranspose3d<num_comp_out_" << i << ",Q_1d>(data, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i << ");\n";
+          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
+          tab.push();
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          tab.pop();
+          code << tab << "}\n";
+          code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
+          // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+  } else if (use_3d_slices) {
+    // Copy or apply transpose grad, if needed
+    code << "\n";
+    code << tab << "// -- Output fields\n";
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
+      std::string var_suffix = "_out_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Basis action
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          tab.push();
+          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
+          tab.pop();
+          code << tab << "}\n";
+          break;
+        case CEED_EVAL_INTERP:
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          tab.push();
+          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
+          tab.pop();
+          code << tab << "}\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
+               << var_suffix << ", r_q" << var_suffix << ");\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_WEIGHT:
+          break;  // Should not occur
+        case CEED_EVAL_DIV:
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
-    code << "    }\n";
   }
+  tab.pop();
+  code << tab << "}\n";
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Output basis apply if needed
-  // Generate the correct eval mode code for each output
-  code << "\n    // -- Output field basis action and restrictions --\n";
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "    // ---- Output field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    // TODO put in a function
-    // Basis action
-    code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        code << "    CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n";
-        break;  // No action
-      case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
-        code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_out_" << i << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i
-             << ", s_B_out_" << i << ", r_v_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
-        if (use_collograd_parallelization) {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_out_" << i << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i
-               << ", s_B_out_" << i << ", r_v_" << i << ");\n";
-        } else {
-          CeedInt P_1d;
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-          CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-          code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp_out_" << i
-               << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i << ", s_B_out_" << i << ", s_G_out_" << i << ", r_v_" << i << ");\n";
-        }
-        break;
-      // LCOV_EXCL_START
-      case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-        break;  // Should not occur
-      }
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
-        break;  // Should not occur
+//------------------------------------------------------------------------------
+// Build single operator kernel
+//------------------------------------------------------------------------------
+extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
+  bool                    is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
+  Ceed                    ceed;
+  CeedInt                 Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
+  CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Cuda_gen *qf_data;
+  CeedQFunction           qf;
+  CeedOperatorField      *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda_gen  *data;
+  std::ostringstream      code;
+  Tab                     tab;
+
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  {
+    bool is_setup_done;
+
+    CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
+    if (is_setup_done) {
+      *is_good_build = !data->use_fallback;
+      return CEED_ERROR_SUCCESS;
+    }
+  }
+
+  // Check field compatibility
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  {
+    bool has_shared_bases = true;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
       }
-        // LCOV_EXCL_STOP
+      CeedCallBackend(CeedBasisDestroy(&basis));
     }
-    // TODO put in a function
-    // Restriction
-    bool is_strided;
-    CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-    if (!is_strided) {
-      CeedInt comp_stride;
 
-      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-      code << "    const CeedInt l_size_out_" << i << " = " << l_size << ";\n";
-      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-      code << "    // CompStride: " << comp_stride << "\n";
-      CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
-      data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-      code << "    writeDofsOffset" << dim << "d<num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ">(data, l_size_out_" << i
-           << ", elem, indices.outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n";
-    } else {
-      bool    has_backend_strides;
-      CeedInt num_elem;
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
 
-      CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
-      CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
-      CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
 
-      if (!has_backend_strides) {
-        CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
       }
-      code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-      code << "    writeDofsStrided" << dim << "d<num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
-           << ">(data, elem, r_v_" << i << ", d_v_" << i << ");\n";
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+    // -- Fallback to ref if not all bases are shared
+    if (!has_shared_bases) {
+      *is_good_build = false;
+      return CEED_ERROR_SUCCESS;
     }
   }
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Get operator data
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  {
+    CeedInt max_P = 0, max_P_1d = 0;
+
+    CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
+                                                         op_output_fields, qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor,
+                                                         &use_3d_slices));
+    data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
+  }
+  if (is_at_points) {
+    CeedInt                   coords_dim = 0;
+    CeedElemRestriction_Cuda *rstr_data;
+    CeedElemRestriction       rstr_points = NULL;
 
-  code << "  }\n";
-  code << "}\n";
-  code << "// -----------------------------------------------------------------------------\n\n";
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim));
+    CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
+    data->points.indices = (CeedInt *)rstr_data->d_offsets;
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+    if (max_dim == 0) max_dim = coords_dim;
+    if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim));
+  }
+  if (max_dim == 0) max_dim = 1;
+  data->dim = max_dim;
+  if (is_at_points) use_3d_slices = false;
+  if (Q_1d == 0) {
+    if (is_at_points) Q_1d = max_num_points;
+    else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
+  }
+  if (Q == 0) Q = Q_1d;
+  data->Q    = Q;
+  data->Q_1d = Q_1d;
 
-  // View kernel for debugging
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated Operator Kernels:\n");
-  CeedDebug(ceed, code.str().c_str());
+  // Check for restriction only identity operator
+  {
+    bool is_identity_qf;
 
-  CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
-  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
+    CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
+    if (is_identity_qf) {
+      CeedEvalMode eval_mode_in, eval_mode_out;
 
-  CeedCallBackend(CeedOperatorSetSetupDone(op));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
+      CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
+                "Backend does not implement restriction only identity operators");
+    }
+  }
+
+  // Add atomicAdd function for old NVidia architectures
+  {
+    Ceed_Cuda            *ceed_data;
+    struct cudaDeviceProp prop;
+
+    CeedCallBackend(CeedGetData(ceed, &ceed_data));
+    CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
+    if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
+      code << tab << "// AtomicAdd fallback source\n";
+      code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
+    }
+  }
+
+  // Load basis source files
+  if (!is_all_nontensor) {
+    code << tab << "// Tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
+  }
+  if (!is_all_tensor) {
+    code << tab << "// Non-tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
+  }
+  if (!is_all_tensor && !is_all_nontensor) {
+    code << "// Tensor basis source\n";
+    code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h>\n\n";
+  }
+  if (is_at_points) {
+    code << "// AtPoints basis source\n";
+    code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
+  }
+  code << "// CodeGen operator source\n";
+  code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
+
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
+
+  operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
+
+  // Define CEED_Q_VLA
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
+    code << tab << "#define CEED_Q_VLA 1\n\n";
+  } else {
+    code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+  }
+
+  // Add user QFunction source
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
+
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
+  }
+
+  // Setup
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
+          "points) {\n";
+  tab.push();
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+  }
+
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
+  if (!is_all_tensor) {
+    code << tab << "const CeedInt Q = " << Q << ";\n";
+  }
+  if (!is_all_nontensor) {
+    code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
+  }
+  if (is_at_points) {
+    code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
+    code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
+  }
+
+  // Shared data
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Cuda data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+
+  // -- Determine input mat reuse
+  FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          input_matrix_reuse[i].index     = j;
+          input_matrix_reuse[i].is_input  = true;
+          input_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            input_matrix_reuse[i].index     = j;
+            input_matrix_reuse[i].is_input  = true;
+            input_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = true;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = true;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = false;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = false;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // Initialize constants, and matrices B and G
+  code << "\n" << tab << "// Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
+  }
+  code << "\n" << tab << "// Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
+  }
+
+  // Loop over all elements
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
+
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+
+  // -- Input restriction and basis
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    // ---- Restriction
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                          is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, false));
+
+  // -- Output basis and restriction
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
+                                                          is_all_tensor, is_at_points, use_3d_slices));
+
+    // ---- Restriction
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
+                                                                false, is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // Close loop and function
+  tab.pop();
+  code << tab << "}\n";
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
+
+  // Compile
+  {
+    bool          is_compile_good = false;
+    const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
+
+    data->thread_1d = T_1d;
+    CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
+    } else {
+      *is_good_build     = false;
+      data->use_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Build AtPoints assembly operator kernel
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, bool is_full, bool *is_good_build) {
+  bool                    is_all_tensor = true, is_at_points = false, use_3d_slices = false;
+  Ceed                    ceed;
+  CeedInt                 Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
+  CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Cuda_gen *qf_data;
+  CeedQFunction           qf;
+  CeedOperatorField      *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda_gen  *data;
+  std::ostringstream      code;
+  Tab                     tab;
+
+  // Check compatibility
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
+
+  // Retrieve operator data
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  Q       = data->Q;
+  Q_1d    = data->Q_1d;
+  max_dim = data->dim;
+  {
+    CeedElemRestriction rstr_points = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+
+  // Add atomicAdd function for old NVidia architectures
+  {
+    Ceed_Cuda            *ceed_data;
+    struct cudaDeviceProp prop;
+
+    CeedCallBackend(CeedGetData(ceed, &ceed_data));
+    CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
+    if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
+      code << tab << "// AtomicAdd fallback source\n";
+      code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
+    }
+  }
+
+  // Load basis source files
+  code << tab << "// Tensor basis source\n";
+  code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
+  code << tab << "// AtPoints basis source\n";
+  code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
+  code << tab << "// CodeGen operator source\n";
+  code << tab << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
+
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
+
+  if (is_full) {
+    operator_name = "CeedKernelCudaGenOperatorFullAssembly_" + qfunction_name;
+  } else {
+    operator_name = "CeedKernelCudaGenOperatorDiagonalAssembly_" + qfunction_name;
+  }
+
+  // Define CEED_Q_VLA
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
+  code << tab << "#define CEED_Q_VLA 1\n\n";
+
+  // Add user QFunction source
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
+
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
+  }
+
+  // Setup
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Assembly Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
+          "points, CeedScalar *__restrict__ values_array) {\n";
+  tab.push();
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+  }
+
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
+  code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
+  code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
+  code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
+
+  // Shared data
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Cuda data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+
+  // -- Determine input mat reuse
+  FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        input_matrix_reuse[i].index     = j;
+        input_matrix_reuse[i].is_input  = true;
+        input_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        output_matrix_reuse[i].index     = j;
+        output_matrix_reuse[i].is_input  = true;
+        output_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        output_matrix_reuse[i].index     = j;
+        output_matrix_reuse[i].is_input  = false;
+        output_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // Initialize constants, and matrices B and G
+  code << "\n" << tab << "// Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
+  }
+  code << "\n" << tab << "// Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
+  }
+
+  // Loop over all elements
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
+
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+
+  // -- Input restriction and basis
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
+  CeedInt active_field_index = -1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    if (is_active) {
+      std::string var_suffix = "_in_" + std::to_string(f);
+
+      code << tab << "// Active field - no restriction or basis action here\n";
+      if (active_field_index == -1) {
+        active_field_index = f;
+        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
+             << "] = {0.0};\n";
+      } else {
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
+      }
+    } else {
+      // ---- Restriction
+      CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+
+      // ---- Basis action
+      CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                            is_all_tensor, is_at_points, use_3d_slices));
+    }
+  }
+
+  // -- Loop over active field
+  std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
+
+  code << "\n" << tab << "// Loop over nodes in active field\n";
+  code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
+       << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
+  tab.push();
+
+  // -- Set current active node and component to 1
+  code << tab << "// Set current active node and component to 1.0\n";
+  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
+       << active_var_suffix << ");\n\n";
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                          is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
+
+  // -- Output basis and restriction
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool        is_active = false;
+    const char *field_name;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
+                                                          is_all_tensor, is_at_points, use_3d_slices));
+
+    // ---- Restriction
+    if (is_full) {
+      std::string         var_suffix = "_out_" + std::to_string(i);
+      CeedInt             comp_stride;
+      CeedSize            l_size;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+      code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
+           << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    } else {
+      std::string         var_suffix = "_out_" + std::to_string(i);
+      CeedInt             comp_stride;
+      CeedSize            l_size;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+      code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
+           << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+
+  // -- Reset current active node and component
+  code << "\n" << tab << "// Reset current active node and component to 0.0\n";
+  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
+       << active_var_suffix << ");\n";
+
+  // -- End of loop over active field
+  tab.pop();
+  code << tab << "}\n";
+
+  // Close loop and function
+  tab.pop();
+  code << tab << "}\n";
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
+
+  // Compile
+  {
+    bool          is_compile_good = false;
+    const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
+
+    data->thread_1d = T_1d;
+    CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good,
+                                        is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
+                                         is_full ? &data->assemble_full : &data->assemble_diagonal));
+    } else {
+      *is_good_build              = false;
+      data->use_assembly_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
+
+extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build) {
+  return CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(op, false, is_good_build);
+}
+
+extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build) {
+  return CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(op, true, is_good_build);
+}
+
+//------------------------------------------------------------------------------
+// Build QFunction assembly operator kernel
+//------------------------------------------------------------------------------
+extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOperator op, bool *is_good_build) {
+  bool                    is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
+  Ceed                    ceed;
+  CeedInt                 Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
+  CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Cuda_gen *qf_data;
+  CeedQFunction           qf;
+  CeedOperatorField      *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda_gen  *data;
+  std::ostringstream      code;
+  Tab                     tab;
+
+  // Check compatibility
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
+
+  // Check field compatibility
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  {
+    bool has_shared_bases = true;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
+
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+  }
+
+  // Retrieve operator data
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  Q       = data->Q;
+  Q_1d    = data->Q_1d;
+  max_dim = data->dim;
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Add atomicAdd function for old NVidia architectures
+  {
+    Ceed_Cuda            *ceed_data;
+    struct cudaDeviceProp prop;
+
+    CeedCallBackend(CeedGetData(ceed, &ceed_data));
+    CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
+    if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
+      code << tab << "// AtomicAdd fallback source\n";
+      code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
+    }
+  }
+
+  // Load basis source files
+  if (!is_all_nontensor) {
+    code << tab << "// Tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
+  }
+  if (!is_all_tensor) {
+    code << tab << "// Non-tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
+  }
+  if (!is_all_tensor && !is_all_nontensor) {
+    code << "// Tensor basis source\n";
+    code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h>\n\n";
+  }
+  code << "// CodeGen operator source\n";
+  code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
+
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
+
+  operator_name = "CeedKernelCudaGenQFunctionAssembly_" + qfunction_name;
+
+  // Define CEED_Q_VLA
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
+    code << tab << "#define CEED_Q_VLA 1\n\n";
+  } else {
+    code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+  }
+
+  // Add user QFunction source
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
+
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
+  }
+
+  // Setup
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Assembly Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
+          "points, CeedScalar *__restrict__ values_array) {\n";
+  tab.push();
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool is_active = false;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (is_active) {
+      code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+    }
+  }
+
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
+  if (!is_all_tensor) {
+    code << tab << "const CeedInt Q = " << Q << ";\n";
+  }
+  if (!is_all_nontensor) {
+    code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
+  }
+
+  // Shared data
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Cuda data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+
+  // -- Determine input mat reuse
+  FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          input_matrix_reuse[i].index     = j;
+          input_matrix_reuse[i].is_input  = true;
+          input_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            input_matrix_reuse[i].index     = j;
+            input_matrix_reuse[i].is_input  = true;
+            input_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = true;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = true;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = false;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = false;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // Initialize constants, and matrices B and G
+  code << "\n" << tab << "// Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true));
+  }
+  code << "\n" << tab << "// Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true));
+  }
+
+  // Loop over all elements
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
+
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+
+  // -- Input restriction and basis
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
+  CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
+  CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    if (is_active) {
+      CeedEvalMode eval_mode;
+      CeedInt      field_size;
+
+      active_fields_in[num_active_in] = f;
+      num_active_in++;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
+      if (eval_mode == CEED_EVAL_GRAD) {
+        code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
+             << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
+      } else {
+        code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
+      }
+      code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
+    } else {
+      // ---- Restriction
+      CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+
+      // ---- Basis action
+      CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                            is_all_tensor, is_at_points, use_3d_slices));
+    }
+  }
+  code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
+  for (CeedInt i = 0; i < num_active_in; i++) {
+    code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
+  }
+  code << "};\n";
+  code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
+  for (CeedInt i = 0; i < num_active_in; i++) {
+    code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
+  }
+  code << "};\n";
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool is_active = false;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (is_active) {
+      const char *field_name;
+      CeedInt     field_size;
+
+      active_fields_out[num_active_out] = i;
+      num_active_out++;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+      qf_assembly_size_out += field_size;
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
+    }
+  }
+  code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
+  for (CeedInt i = 0; i < num_active_out; i++) {
+    code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
+  }
+  code << "};\n";
+  code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
+
+  // -- Loop over active field
+  code << "\n" << tab << "CeedInt input_offset = 0;\n";
+  code << tab << "// Loop over active QFunction input fields\n";
+  code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
+  code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
+  tab.push();
+
+  // -- Loop over size of active field
+  code << "\n" << tab << "// Loop over current active input field size\n";
+  code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
+  code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
+  tab.push();
+
+  // -- Set current active point and component to 1
+  code << tab << "// Set current active point and component to 1.0\n";
+  if (is_all_tensor && (max_dim >= 3)) {
+    code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
+  } else {
+    code << tab << "r_q_in[a][s] = 1.0;\n";
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
+
+  // -- Output basis and restriction
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
+  CeedScalar offset = 0;
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool        is_active = false;
+    const char *field_name;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+
+    // ---- Restriction
+    CeedInt field_size;
+
+    code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
+         << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
+    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+    offset += field_size;
+  }
+
+  // -- Reset current active node and component
+  code << "\n" << tab << "// Reset current active node and component to 0.0\n";
+  if (is_all_tensor && (max_dim >= 3)) {
+    code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
+  } else {
+    code << tab << "r_q_in[a][s] = 0.0;\n";
+  }
+
+  // -- End of loop over size of active field
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "input_offset += field_size_in;\n";
+
+  // -- End of loop over active field
+  tab.pop();
+  code << tab << "}\n";
+
+  // Close loop and function
+  tab.pop();
+  code << tab << "}\n";
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
+
+  // Compile
+  {
+    bool          is_compile_good = false;
+    const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
+
+    data->thread_1d = T_1d;
+    CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
+    } else {
+      *is_good_build              = false;
+      data->use_assembly_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.h b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
index 28031e8e3b..8fd3ee12c5 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator-build.h
+++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -6,4 +6,7 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op);
+CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOperator op, bool *is_good_build);
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
index 840d97afb9..97fcf6b4b0 100644
--- a/backends/cuda-gen/ceed-cuda-gen-operator.c
+++ b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,7 +8,10 @@
 #include <ceed.h>
 #include <ceed/backend.h>
 #include <ceed/jit-source/cuda/cuda-types.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
 #include <stddef.h>
+#include <string.h>
 
 #include "../cuda/ceed-cuda-common.h"
 #include "../cuda/ceed-cuda-compile.h"
@@ -19,10 +22,18 @@
 // Destroy operator
 //------------------------------------------------------------------------------
 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
+  Ceed                   ceed;
   CeedOperator_Cuda_gen *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  if (impl->module) CeedCallCuda(ceed, cuModuleUnload(impl->module));
+  if (impl->module_assemble_full) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_full));
+  if (impl->module_assemble_diagonal) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_diagonal));
+  if (impl->module_assemble_qfunction) CeedCallCuda(ceed, cuModuleUnload(impl->module_assemble_qfunction));
+  if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -91,18 +102,23 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
 //------------------------------------------------------------------------------
 // Apply and add to output
 //------------------------------------------------------------------------------
-static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good,
+                                             CeedRequest *request) {
+  bool                    is_at_points, is_tensor;
   Ceed                    ceed;
   Ceed_Cuda              *cuda_data;
   CeedInt                 num_elem, num_input_fields, num_output_fields;
   CeedEvalMode            eval_mode;
-  CeedVector              output_vecs[CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
   CeedQFunction_Cuda_gen *qf_data;
   CeedQFunction           qf;
   CeedOperatorField      *op_input_fields, *op_output_fields;
   CeedOperator_Cuda_gen  *data;
 
+  // Build the operator kernel
+  CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, is_run_good));
+  if (!(*is_run_good)) return CEED_ERROR_SUCCESS;
+
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedGetData(ceed, &cuda_data));
   CeedCallBackend(CeedOperatorGetData(op, &data));
@@ -112,66 +128,72 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
-  // Check for tensor-product bases
-  {
-    bool has_tensor_bases;
-
-    CeedCallBackend(CeedOperatorHasTensorBases(op, &has_tensor_bases));
-    // -- Fallback to ref if not all bases are tensor-product
-    if (!has_tensor_bases) {
-      CeedOperator op_fallback;
-
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to non-tensor bases");
-      CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
-      CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
-      return CEED_ERROR_SUCCESS;
-    }
-  }
-
-  // Creation of the operator
-  CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op));
-
   // Input vectors
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedVector vec;
-
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       data->fields.inputs[i] = NULL;
     } else {
+      bool       is_active;
+      CeedVector vec;
+
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
-      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) data->fields.inputs[i] = input_arr;
+      else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
   // Output vectors
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedVector vec;
-
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       data->fields.outputs[i] = NULL;
     } else {
+      bool       is_active;
+      CeedVector vec;
+
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
-      output_vecs[i] = vec;
-      // Check for multiple output modes
-      CeedInt index = -1;
-
-      for (CeedInt j = 0; j < i; j++) {
-        if (vec == output_vecs[j]) {
-          index = j;
-          break;
-        }
-      }
-      if (index == -1) {
-        CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
-      } else {
-        data->fields.outputs[i] = data->fields.outputs[index];
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) data->fields.outputs[i] = output_arr;
+      else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+  }
+
+  // Point coordinates, if needed
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  if (is_at_points) {
+    // Coords
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+    CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+    CeedCallBackend(CeedVectorDestroy(&vec));
+
+    // Points per elem
+    if (num_elem != data->points.num_elem) {
+      CeedInt            *points_per_elem;
+      const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+      CeedElemRestriction rstr_points = NULL;
+
+      data->points.num_elem = num_elem;
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+      CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+      for (CeedInt e = 0; e < num_elem; e++) {
+        CeedInt num_points_elem;
+
+        CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+        points_per_elem[e] = num_points_elem;
       }
+      if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
+      CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+      CeedCallBackend(CeedFree(&points_per_elem));
     }
   }
 
@@ -179,66 +201,661 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
 
   // Apply operator
-  void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W};
-  const CeedInt dim       = data->dim;
-  const CeedInt Q_1d      = data->Q_1d;
-  const CeedInt P_1d      = data->max_P_1d;
-  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
-  int           max_threads_per_block, min_grid_size;
+  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
+  int   max_threads_per_block, min_grid_size, grid;
 
+  CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
-  int block[3] =
-      {
-          thread_1d,
-          dim < 2 ? 1 : thread_1d,
-          -1,
-      },
-      grid;
-
-  CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
-                                     cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+  int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
+
+  if (is_tensor) {
+    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
+                                       cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+  } else {
+    CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1));
+
+    grid     = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+    block[2] = elems_per_block;
+  }
   CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
 
-  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs));
+  CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, stream, grid, block[0], block[1], block[2], shared_mem, is_run_good, opargs));
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedVector vec;
-
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
+      CeedVector vec;
+
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
-      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
   // Restore output arrays
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedVector vec;
-
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
+      CeedVector vec;
+
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
-      // Check for multiple output modes
-      CeedInt index = -1;
-      for (CeedInt j = 0; j < i; j++) {
-        if (vec == output_vecs[j]) {
-          index = j;
-          break;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+  }
+
+  // Restore point coordinates, if needed
+  if (is_at_points) {
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+    CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+    CeedCallBackend(CeedVectorDestroy(&vec));
+  }
+
+  // Restore context data
+  CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+  // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  if (!(*is_run_good)) data->use_fallback = true;
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+  bool              is_run_good = false;
+  const CeedScalar *input_arr   = NULL;
+  CeedScalar       *output_arr  = NULL;
+
+  // Try to run kernel
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
+  CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, NULL, input_arr, output_arr, &is_run_good, request));
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
+
+  // Fallback on unsuccessful run
+  if (!is_run_good) {
+    CeedOperator op_fallback;
+
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for ApplyAdd\n");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedOperatorApplyAddComposite_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+  bool              is_run_good[CEED_COMPOSITE_MAX] = {false}, is_sequential;
+  CeedInt           num_suboperators;
+  const CeedScalar *input_arr  = NULL;
+  CeedScalar       *output_arr = NULL;
+  Ceed              ceed;
+  CeedOperator     *sub_operators;
+  cudaStream_t      stream = NULL;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+  CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+  CeedCall(CeedOperatorCompositeIsSequential(op, &is_sequential));
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
+  if (is_sequential) CeedCallCuda(ceed, cudaStreamCreate(&stream));
+  for (CeedInt i = 0; i < num_suboperators; i++) {
+    CeedInt num_elem = 0;
+
+    CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
+    if (num_elem > 0) {
+      if (!is_sequential) CeedCallCuda(ceed, cudaStreamCreate(&stream));
+      CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request));
+      if (!is_sequential) CeedCallCuda(ceed, cudaStreamDestroy(stream));
+    }
+  }
+  if (is_sequential) CeedCallCuda(ceed, cudaStreamDestroy(stream));
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
+  CeedCallCuda(ceed, cudaDeviceSynchronize());
+
+  // Fallback on unsuccessful run
+  for (CeedInt i = 0; i < num_suboperators; i++) {
+    if (!is_run_good[i]) {
+      CeedOperator op_fallback;
+
+      CeedDebug(ceed, "\nFalling back to /gpu/cuda/ref CeedOperator for ApplyAdd\n");
+      CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
+      CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// QFunction assembly
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
+                                                            CeedRequest *request) {
+  Ceed                   ceed;
+  CeedOperator_Cuda_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_qfunction && !data->use_assembly_fallback) {
+    bool is_build_good = false;
+
+    CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good));
+    if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(op, &is_build_good));
+    if (!is_build_good) data->use_assembly_fallback = true;
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                    is_run_good = true;
+    Ceed_Cuda              *cuda_data;
+    CeedInt                 num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode            eval_mode;
+    CeedScalar             *assembled_array;
+    CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Cuda_gen *qf_data;
+    CeedQFunction           qf;
+    CeedOperatorField      *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &cuda_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Build objects if needed
+    if (build_objects) {
+      CeedInt qf_size_in = 0, qf_size_out = 0, Q;
+
+      // Count number of active input fields
+      {
+        for (CeedInt i = 0; i < num_input_fields; i++) {
+          CeedInt    field_size;
+          CeedVector vec;
+
+          // Get input vector
+          CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+          // Check if active input
+          if (vec == CEED_VECTOR_ACTIVE) {
+            CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+            qf_size_in += field_size;
+          }
+          CeedCallBackend(CeedVectorDestroy(&vec));
+        }
+        CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+      }
+
+      // Count number of active output fields
+      {
+        for (CeedInt i = 0; i < num_output_fields; i++) {
+          CeedInt    field_size;
+          CeedVector vec;
+
+          // Get output vector
+          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+          // Check if active output
+          if (vec == CEED_VECTOR_ACTIVE) {
+            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+            qf_size_out += field_size;
+          }
+          CeedCallBackend(CeedVectorDestroy(&vec));
+        }
+        CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+      }
+      CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+
+      // Actually build objects now
+      const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
+      CeedInt        strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
+
+      // Create output restriction
+      CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out,
+                                                       (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides,
+                                                       rstr));
+      // Create assembled vector
+      CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
+    }
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArrayWrite(*assembled, CEED_MEM_DEVICE, &assembled_array));
+
+    // Assemble QFunction
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};
+    bool  is_tensor = false;
+    int   max_threads_per_block, min_grid_size, grid;
+
+    CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
+    CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
+    int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
+
+    if (is_tensor) {
+      CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
+                                         cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+    } else {
+      CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1));
+
+      grid     = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+      block[2] = elems_per_block;
+    }
+    CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
+
+    CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_qfunction, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good,
+                                                   opargs));
+    CeedCallCuda(ceed, cudaDeviceSynchronize());
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) {
+      data->use_assembly_fallback = true;
+      if (build_objects) {
+        CeedCallBackend(CeedVectorDestroy(assembled));
+        CeedCallBackend(CeedElemRestrictionDestroy(rstr));
+      }
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for LinearAssemblyQFunction\n");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
+    return CEED_ERROR_SUCCESS;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedOperatorLinearAssembleQFunction_Cuda_gen(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(op, true, assembled, rstr, request);
+}
+
+static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(op, false, &assembled, &rstr, request);
+}
+
+//------------------------------------------------------------------------------
+// AtPoints diagonal assembly
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) {
+  Ceed                   ceed;
+  CeedOperator_Cuda_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_diagonal && !data->use_assembly_fallback) {
+    bool                     is_build_good = false;
+    CeedInt                  num_active_bases_in, num_active_bases_out;
+    CeedOperatorAssemblyData assembly_data;
+
+    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
+    CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
+                                                         NULL, NULL));
+    if (num_active_bases_in == num_active_bases_out) {
+      CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good));
+      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(op, &is_build_good));
+    }
+    if (!is_build_good) data->use_assembly_fallback = true;
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                    is_run_good = true;
+    Ceed_Cuda              *cuda_data;
+    CeedInt                 num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode            eval_mode;
+    CeedScalar             *assembled_array;
+    CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Cuda_gen *qf_data;
+    CeedQFunction           qf;
+    CeedOperatorField      *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &cuda_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+
+      // Points per elem
+      if (num_elem != data->points.num_elem) {
+        CeedInt            *points_per_elem;
+        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+        CeedElemRestriction rstr_points = NULL;
+
+        data->points.num_elem = num_elem;
+        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+        for (CeedInt e = 0; e < num_elem; e++) {
+          CeedInt num_points_elem;
+
+          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+          points_per_elem[e] = num_points_elem;
+        }
+        if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
+        CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
+        CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+        CeedCallBackend(CeedFree(&points_per_elem));
+      }
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
+
+    // Assemble diagonal
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};
+    int   max_threads_per_block, min_grid_size, grid;
+
+    CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
+    int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};
+
+    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1,
+                                       cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+    CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
+
+    CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_diagonal, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good,
+                                                   opargs));
+    CeedCallCuda(ceed, cudaDeviceSynchronize());
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Restore point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) data->use_assembly_fallback = true;
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for AtPoints LinearAssembleAddDiagonal\n");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request));
+    return CEED_ERROR_SUCCESS;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// AtPoints full assembly
+//------------------------------------------------------------------------------
+static int CeedOperatorAssembleSingleAtPoints_Cuda_gen(CeedOperator op, CeedInt offset, CeedVector assembled) {
+  Ceed                   ceed;
+  CeedOperator_Cuda_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_full && !data->use_assembly_fallback) {
+    bool                     is_build_good = false;
+    CeedInt                  num_active_bases_in, num_active_bases_out;
+    CeedOperatorAssemblyData assembly_data;
+
+    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
+    CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
+                                                         NULL, NULL));
+    if (num_active_bases_in == num_active_bases_out) {
+      CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_build_good));
+      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(op, &is_build_good));
+    }
+    if (!is_build_good) data->use_assembly_fallback = true;
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                    is_run_good = true;
+    Ceed_Cuda              *cuda_data;
+    CeedInt                 num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode            eval_mode;
+    CeedScalar             *assembled_array;
+    CeedQFunctionField     *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Cuda_gen *qf_data;
+    CeedQFunction           qf;
+    CeedOperatorField      *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &cuda_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+
+      // Points per elem
+      if (num_elem != data->points.num_elem) {
+        CeedInt            *points_per_elem;
+        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+        CeedElemRestriction rstr_points = NULL;
+
+        data->points.num_elem = num_elem;
+        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+        for (CeedInt e = 0; e < num_elem; e++) {
+          CeedInt num_points_elem;
+
+          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+          points_per_elem[e] = num_points_elem;
         }
+        if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
+        CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
+        CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+        CeedCallBackend(CeedFree(&points_per_elem));
       }
-      if (index == -1) {
-        CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
+    CeedScalar *assembled_offset_array = &assembled_array[offset];
+
+    // Assemble diagonal
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields,          &data->B,
+                      &data->G,          &data->W,      &data->points,  &assembled_offset_array};
+    int   max_threads_per_block, min_grid_size, grid;
+
+    CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
+    int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};
+
+    CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, 1,
+                                       cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
+    CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
+
+    CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->assemble_full, NULL, grid, block[0], block[1], block[2], shared_mem, &is_run_good,
+                                                   opargs));
+    CeedCallCuda(ceed, cudaDeviceSynchronize());
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
       }
     }
+
+    // Restore point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) data->use_assembly_fallback = true;
   }
+  CeedCallBackend(CeedDestroy(&ceed));
 
-  // Restore context data
-  CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for AtPoints SingleOperatorAssemble\n");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorAssembleSingle(op_fallback, offset, assembled));
+    return CEED_ERROR_SUCCESS;
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -246,14 +863,32 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
 // Create operator
 //------------------------------------------------------------------------------
 int CeedOperatorCreate_Cuda_gen(CeedOperator op) {
+  bool                   is_composite, is_at_points;
   Ceed                   ceed;
   CeedOperator_Cuda_gen *impl;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedOperatorSetData(op, impl));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen));
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  if (is_composite) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Cuda_gen));
+  } else {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen));
+  }
+  CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+  if (is_at_points) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal",
+                                           CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Cuda_gen));
+  }
+  if (!is_at_points) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda_gen));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate",
+                                           CeedOperatorLinearAssembleQFunctionUpdate_Cuda_gen));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen-qfunction.c b/backends/cuda-gen/ceed-cuda-gen-qfunction.c
index ccff67a476..38c5cc9ee1 100644
--- a/backends/cuda-gen/ceed-cuda-gen-qfunction.c
+++ b/backends/cuda-gen/ceed-cuda-gen-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -27,7 +27,6 @@ static int CeedQFunctionDestroy_Cuda_gen(CeedQFunction qf) {
 
   CeedCallBackend(CeedQFunctionGetData(qf, &data));
   CeedCallCuda(CeedQFunctionReturnCeed(qf), cudaFree(data->d_c));
-  CeedCallBackend(CeedFree(&data->qfunction_source));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
 }
@@ -43,15 +42,11 @@ int CeedQFunctionCreate_Cuda_gen(CeedQFunction qf) {
   CeedCallBackend(CeedCalloc(1, &data));
   CeedCallBackend(CeedQFunctionSetData(qf, data));
 
-  // Read QFunction source
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
-  CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");
-  CeedCheck(data->qfunction_source, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Cuda_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c
index e1833be2a2..799c35fd1e 100644
--- a/backends/cuda-gen/ceed-cuda-gen.c
+++ b/backends/cuda-gen/ceed-cuda-gen.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -18,8 +18,7 @@
 //------------------------------------------------------------------------------
 static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
   char      *resource_root;
-  const char fallback_resource[] = "/gpu/cuda/ref";
-  Ceed       ceed_shared;
+  Ceed       ceed_shared, ceed_ref;
   Ceed_Cuda *data;
 
   CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root));
@@ -31,13 +30,18 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetData(ceed, data));
   CeedCallBackend(CeedInit_Cuda(ceed, resource));
 
-  CeedCall(CeedInit("/gpu/cuda/shared", &ceed_shared));
+  CeedCallBackend(CeedInit("/gpu/cuda/shared", &ceed_shared));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
+  CeedCallBackend(CeedDestroy(&ceed_shared));
 
-  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));
+  CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceed_ref));
+  CeedCallBackend(CeedSetOperatorFallbackCeed(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda_gen));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "CompositeOperatorCreate", CeedOperatorCreate_Cuda_gen));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreate_Cuda_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h
index d10dece242..0e04f3c4e4 100644
--- a/backends/cuda-gen/ceed-cuda-gen.h
+++ b/backends/cuda-gen/ceed-cuda-gen.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -12,21 +12,23 @@
 #include <cuda.h>
 
 typedef struct {
+  bool           use_fallback, use_assembly_fallback;
   CeedInt        dim;
-  CeedInt        Q_1d;
+  CeedInt        Q, Q_1d;
   CeedInt        max_P_1d;
-  CUmodule       module;
-  CUfunction     op;
+  CeedInt        thread_1d;
+  CUmodule       module, module_assemble_full, module_assemble_diagonal, module_assemble_qfunction;
+  CUfunction     op, assemble_full, assemble_diagonal, assemble_qfunction;
   FieldsInt_Cuda indices;
   Fields_Cuda    fields;
   Fields_Cuda    B;
   Fields_Cuda    G;
   CeedScalar    *W;
+  Points_Cuda    points;
 } CeedOperator_Cuda_gen;
 
 typedef struct {
   const char *qfunction_name;
-  const char *qfunction_source;
   void       *d_c;
 } CeedQFunction_Cuda_gen;
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index 529c538182..7ed1865a95 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -10,6 +10,7 @@
 #include <ceed/jit-tools.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <string.h>
 
 #include "../cuda/ceed-cuda-common.h"
 #include "../cuda/ceed-cuda-compile.h"
@@ -18,7 +19,8 @@
 //------------------------------------------------------------------------------
 // Basis apply - tensor
 //------------------------------------------------------------------------------
-int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+static int CeedBasisApplyCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                   CeedVector u, CeedVector v) {
   Ceed              ceed;
   CeedInt           Q_1d, dim;
   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
@@ -33,14 +35,12 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose) {
-    CeedSize length;
-
-    CeedCallBackend(CeedVectorGetLength(v, &length));
-    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
@@ -60,6 +60,7 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->Grad, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT: {
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
       void     *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
       const int block_size_x  = Q_1d;
       const int block_size_y  = dim >= 2 ? Q_1d : 1;
@@ -79,14 +80,180 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                               CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAdd_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                  CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Basis apply - tensor AtPoints
+//------------------------------------------------------------------------------
+static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
+                                           CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  Ceed              ceed;
+  CeedInt           Q_1d, dim, max_num_points = num_points[0];
+  const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
+  const int         max_block_size = 32;
+  const CeedScalar *d_x, *d_u;
+  CeedScalar       *d_v;
+  CeedBasis_Cuda   *data;
+
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+
+  // Weight handled separately
+  if (eval_mode == CEED_EVAL_WEIGHT) {
+    CeedCallBackend(CeedVectorSetValue(v, 1.0));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
+    }
+  }
+
+  // Build kernels if needed
+  if (data->num_points != max_num_points) {
+    CeedInt P_1d;
+
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    data->num_points = max_num_points;
+
+    // -- Create interp matrix to Chebyshev coefficients
+    if (!data->d_chebyshev_interp_1d) {
+      CeedSize    interp_bytes;
+      CeedScalar *chebyshev_interp_1d;
+
+      interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+      CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+      CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
+      CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
+      CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+    }
+
+    // -- Compile kernels
+    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n";
+    CeedInt    num_comp;
+
+    if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
+                                     Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
+                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
+  }
+
+  // Get read/write access to u, v
+  CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
+
+  // Basis action
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+
+      CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size,
+                                         interp_args));
+    } break;
+    case CEED_EVAL_GRAD: {
+      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+
+      CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
+    } break;
+    case CEED_EVAL_WEIGHT:
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                       CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddAtPoints_Cuda(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                          CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Basis apply - non-tensor
 //------------------------------------------------------------------------------
-int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
-                                 CeedVector v) {
+static int CeedBasisApplyNonTensorCore_Cuda(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                            CeedVector u, CeedVector v) {
   Ceed                     ceed;
   CeedInt                  num_nodes, num_qpts;
   const CeedInt            is_transpose    = t_mode == CEED_TRANSPOSE;
@@ -104,14 +271,12 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose) {
-    CeedSize length;
-
-    CeedCallBackend(CeedVectorGetLength(v, &length));
-    CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
 
   // Apply basis operation
@@ -157,6 +322,7 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr
       }
     } break;
     case CEED_EVAL_WEIGHT: {
+      CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v};
 
       CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args));
@@ -169,6 +335,19 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                        CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                           CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda(basis, true, num_elem, t_mode, eval_mode, u, v));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -182,10 +361,15 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallCuda(ceed, cuModuleUnload(data->module));
-  CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
+  if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
+  CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -199,12 +383,13 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallCuda(ceed, cuModuleUnload(data->module));
-  CeedCallCuda(ceed, cudaFree(data->d_q_weight));
+  if (data->d_q_weight) CeedCallCuda(ceed, cudaFree(data->d_q_weight));
   CeedCallCuda(ceed, cudaFree(data->d_interp));
   CeedCallCuda(ceed, cudaFree(data->d_grad));
   CeedCallCuda(ceed, cudaFree(data->d_div));
   CeedCallCuda(ceed, cudaFree(data->d_curl));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -214,8 +399,6 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
 int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                  const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
   Ceed            ceed;
-  char           *basis_kernel_source;
-  const char     *basis_kernel_path;
   CeedInt         num_comp;
   const CeedInt   q_bytes      = Q_1d * sizeof(CeedScalar);
   const CeedInt   interp_bytes = q_bytes * P_1d;
@@ -225,33 +408,35 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallBackend(CeedCalloc(1, &data));
 
   // Copy data to GPU
-  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
-  CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
+  if (q_weight_1d) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
+  }
   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes));
   CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice));
   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes));
   CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice));
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
-                                   num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                   Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                    "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -261,8 +446,6 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
 int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                            const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                     ceed;
-  char                    *basis_kernel_source;
-  const char              *basis_kernel_path;
   CeedInt                  num_comp, q_comp_interp, q_comp_grad;
   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Cuda *data;
@@ -273,8 +456,10 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
-  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
@@ -289,11 +474,9 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
@@ -301,14 +484,14 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -318,8 +501,6 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
 int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
                              const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                     ceed;
-  char                    *basis_kernel_source;
-  const char              *basis_kernel_path;
   CeedInt                  num_comp, q_comp_interp, q_comp_div;
   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Cuda *data;
@@ -330,8 +511,10 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
-  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
@@ -346,11 +529,9 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
@@ -358,14 +539,14 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -375,8 +556,6 @@ int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
 int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
                               const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                     ceed;
-  char                    *basis_kernel_source;
-  const char              *basis_kernel_path;
   CeedInt                  num_comp, q_comp_interp, q_comp_curl;
   const CeedInt            q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Cuda *data;
@@ -387,8 +566,10 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
-  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
@@ -403,11 +584,9 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                    q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
@@ -415,14 +594,14 @@ int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_no
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 670eb14e12..b4531fde50 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -27,20 +27,28 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Apply data
-  for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
-    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
-  }
-  CeedCallBackend(CeedFree(&impl->e_vecs));
+  CeedCallBackend(CeedFree(&impl->num_points));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_in));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_out));
+  CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
+  CeedCallBackend(CeedFree(&impl->input_field_order));
+  CeedCallBackend(CeedFree(&impl->output_field_order));
+  CeedCallBackend(CeedFree(&impl->input_states));
 
   for (CeedInt i = 0; i < impl->num_inputs; i++) {
+    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i]));
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i]));
   }
+  CeedCallBackend(CeedFree(&impl->e_vecs_in));
   CeedCallBackend(CeedFree(&impl->q_vecs_in));
 
   for (CeedInt i = 0; i < impl->num_outputs; i++) {
+    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i]));
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
   }
+  CeedCallBackend(CeedFree(&impl->e_vecs_out));
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
+  CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem));
 
   // QFunction assembly data
   for (CeedInt i = 0; i < impl->num_active_in; i++) {
@@ -70,10 +78,11 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
     CeedCallCuda(ceed, cudaFree(impl->diag->d_div_out));
     CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_in));
     CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_out));
-    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
-    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
+    CeedCallBackend(CeedDestroy(&ceed));
     CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag));
     CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
   }
   CeedCallBackend(CeedFree(&impl->diag));
 
@@ -84,6 +93,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
     CeedCallCuda(ceed, cuModuleUnload(impl->asmb->module));
     CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_in));
     CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_out));
+    CeedCallBackend(CeedDestroy(&ceed));
   }
   CeedCallBackend(CeedFree(&impl->asmb));
 
@@ -94,8 +104,8 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Setup infields or outfields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e,
-                                        CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
+static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, bool *apply_add_basis,
+                                        CeedVector *e_vecs, CeedVector *q_vecs, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
   Ceed                ceed;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
@@ -111,68 +121,115 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
 
   // Loop over fields
   for (CeedInt i = 0; i < num_fields; i++) {
-    bool         is_strided = false, skip_restriction = false;
-    CeedSize     q_size;
-    CeedInt      size;
-    CeedEvalMode eval_mode;
-    CeedBasis    basis;
+    bool                is_active = false, is_strided = false, skip_e_vec = false;
+    CeedSize            q_size;
+    CeedInt             size;
+    CeedEvalMode        eval_mode;
+    CeedVector          l_vec;
+    CeedElemRestriction elem_rstr;
 
+    // Check whether this field can skip the element restriction:
+    // Input CEED_VECTOR_ACTIVE
+    // Output CEED_VECTOR_ACTIVE without CEED_EVAL_NONE
+    // Input CEED_VECTOR_NONE with CEED_EVAL_WEIGHT
+    // Input passive vector with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
+    CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
-    if (eval_mode != CEED_EVAL_WEIGHT) {
-      CeedElemRestriction elem_rstr;
-
-      // Check whether this field can skip the element restriction:
-      // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
-
-      // First, check whether the field is input or output:
-      if (is_input) {
-        CeedVector vec;
-
-        // Check for passive input
-        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
-        if (vec != CEED_VECTOR_ACTIVE) {
-          // Check eval_mode
-          if (eval_mode == CEED_EVAL_NONE) {
-            // Check for strided restriction
-            CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-            if (is_strided) {
-              // Check if vector is already in preferred backend ordering
-              CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction));
-            }
-          }
-        }
-      }
-      if (skip_restriction) {
-        // We do not need an E-Vector, but will use the input field vector's data directly in the operator application.
-        e_vecs[i + start_e] = NULL;
-      } else {
-        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e]));
-      }
+    skip_e_vec = (is_input && is_active) || (is_active && eval_mode != CEED_EVAL_NONE) || (eval_mode == CEED_EVAL_WEIGHT);
+    if (!skip_e_vec && is_input && !is_active && eval_mode == CEED_EVAL_NONE) {
+      CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
+      if (is_strided) CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_e_vec));
     }
+    if (skip_e_vec) {
+      e_vecs[i] = NULL;
+    } else {
+      CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i]));
+    }
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
-        q_size = (CeedSize)num_elem * Q * size;
-        CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
-        break;
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
-        q_size = (CeedSize)num_elem * Q * size;
+        q_size = (CeedSize)num_elem * (CeedSize)Q * (CeedSize)size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         break;
-      case CEED_EVAL_WEIGHT:  // Only on input fields
+      case CEED_EVAL_WEIGHT: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
-        q_size = (CeedSize)num_elem * Q;
+        q_size = (CeedSize)num_elem * (CeedSize)Q;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        if (is_at_points) {
+          CeedInt num_points[num_elem];
+
+          for (CeedInt i = 0; i < num_elem; i++) num_points[i] = Q;
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE,
+                                                 q_vecs[i]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
+    }
+  }
+  // Drop duplicate restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          skip_rstr[j] = true;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          skip_rstr[j]       = true;
+          apply_add_basis[i] = true;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -180,7 +237,6 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
 //------------------------------------------------------------------------------
 static int CeedOperatorSetup_Cuda(CeedOperator op) {
-  Ceed                ceed;
   bool                is_setup_done;
   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
@@ -191,7 +247,6 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
   if (is_setup_done) return CEED_ERROR_SUCCESS;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -200,159 +255,625 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Allocate
-  CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out));
   impl->num_inputs  = num_input_fields;
   impl->num_outputs = num_output_fields;
 
-  // Set up infield and outfield e_vecs and q_vecs
-  // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
-  // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
+  // Set up infield and outfield e-vecs and q-vecs
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, Q,
+                                               num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
+                                               impl->q_vecs_out, num_output_fields, Q, num_elem));
+
+  // Reorder fields to allow reuse of buffers
+  impl->max_active_e_vec_len = 0;
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedSize            e_vec_len_i;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                       = true;
+      impl->input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) {
+        // CEED_EVAL_WEIGHT
+        CeedCallBackend(CeedVectorDestroy(&vec_i));
+        continue;
+      };
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                       = true;
+          impl->input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedSize            e_vec_len_i;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                        = true;
+      impl->output_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_output_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                        = true;
+          impl->output_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
+  {
+    // Create two work vectors for diagonal assembly
+    CeedVector temp_1, temp_2;
 
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1));
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2));
+  }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
-// Setup Operator Inputs
+// Restrict Operator Inputs
 //------------------------------------------------------------------------------
-static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                               CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
-                                               CeedOperator_Cuda *impl, CeedRequest *request) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
+static inline int CeedOperatorInputRestrict_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                                 CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Cuda *impl,
+                                                 CeedRequest *request) {
+  bool       is_active = false;
+  CeedVector l_vec, e_vec = impl->e_vecs_in[input_field];
+
+  // Get input vector
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
+  }
 
-    // Get input vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) {
-      if (skip_active) continue;
-      else vec = in_vec;
-    }
+  // Restriction action
+  if (e_vec) {
+    // Restrict, if necessary
+    if (!impl->skip_rstr_in[input_field]) {
+      uint64_t state;
 
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
-    } else {
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      // Get input element restriction
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-      if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
-      // Restrict, if necessary
-      if (!impl->e_vecs[i]) {
-        // No restriction for this field; read data directly from vec.
-        CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
-      } else {
-        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
-        // Get evec
-        CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
+      CeedCallBackend(CeedVectorGetState(l_vec, &state));
+      if (is_active || state != impl->input_states[input_field]) {
+        CeedElemRestriction elem_rstr;
+
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request));
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       }
+      impl->input_states[input_field] = state;
     }
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Input Basis Action
 //------------------------------------------------------------------------------
-static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                              CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
+static inline int CeedOperatorInputBasis_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                              CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const bool skip_active,
                                               CeedOperator_Cuda *impl) {
+  bool         is_active = false;
+  CeedEvalMode eval_mode;
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
+
+  // Skip active input
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
+  }
+
+  // Basis action
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  switch (eval_mode) {
+    case CEED_EVAL_NONE: {
+      const CeedScalar *e_vec_array;
+
+      if (e_vec) {
+        CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array));
+      } else {
+        CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array));
+      }
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
+      break;
+    }
+    case CEED_EVAL_INTERP:
+    case CEED_EVAL_GRAD:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL: {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
+      CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, e_vec, q_vec));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      break;
+    }
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
+  }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Restore Input Vectors
+//------------------------------------------------------------------------------
+static inline int CeedOperatorInputRestore_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                                CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Cuda *impl) {
+  bool         is_active = false;
+  CeedEvalMode eval_mode;
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field];
+
+  // Skip active input
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
+  }
+
+  // Restore e-vec
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  if (eval_mode == CEED_EVAL_NONE) {
+    const CeedScalar *e_vec_array;
+
+    CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, (CeedScalar **)&e_vec_array));
+    if (e_vec) {
+      CeedCallBackend(CeedVectorRestoreArrayRead(e_vec, &e_vec_array));
+    } else {
+      CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, &e_vec_array));
+    }
+  }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Apply and add to output
+//------------------------------------------------------------------------------
+static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
+  CeedInt             Q, num_elem, num_input_fields, num_output_fields;
+  Ceed                ceed;
+  CeedVector          active_e_vec;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda  *impl;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetup_Cuda(op));
+
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec));
+
+  // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             elem_size, size;
-    CeedEvalMode        eval_mode;
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+    CeedInt field = impl->input_field_order[i];
+
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl,
+                                                   request));
+    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem, false, impl));
+  }
+
+  // Output pointers, as necessary
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
 
-    // Skip active input
-    if (skip_active) {
-      CeedVector vec;
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedScalar *e_vec_array;
 
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
     }
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
+  }
+
+  // Q function
+  CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
+
+  // Restore input arrays
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl));
+  }
+
+  // Output basis and restriction
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool         is_active = false;
+    CeedInt      field     = impl->output_field_order[i];
+    CeedEvalMode eval_mode;
+    CeedVector   l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
+
+    // Output vector
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
+      l_vec = out_vec;
+      if (!e_vec) e_vec = active_e_vec;
+    }
+
     // Basis action
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
-        break;
+        break;  // No action
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i]));
+      case CEED_EVAL_CURL: {
+        CeedBasis basis;
+
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
+        if (impl->apply_add_basis_out[field]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
+        }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
-      case CEED_EVAL_WEIGHT:
-        break;  // No action
+      }
+      // LCOV_EXCL_START
+      case CEED_EVAL_WEIGHT: {
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+        // LCOV_EXCL_STOP
+      }
     }
+
+    // Restore evec
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
+    }
+
+    // Restrict
+    if (!impl->skip_rstr_out[field]) {
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
+
+  // Return work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
-// Restore Input Vectors
+// CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
 //------------------------------------------------------------------------------
-static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                                 const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode eval_mode;
-    CeedVector   vec;
+static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
+  bool                is_setup_done;
+  CeedInt             max_num_points = -1, num_elem, num_input_fields, num_output_fields;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda  *impl;
+
+  CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
+  if (is_setup_done) return CEED_ERROR_SUCCESS;
+
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  {
+    CeedElemRestriction rstr_points = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedCalloc(num_elem, &impl->num_points));
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt num_points_elem;
 
-    // Skip active input
-    if (skip_active) {
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+      impl->num_points[e] = num_points_elem;
     }
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
-    } else {
-      if (!impl->e_vecs[i]) {  // This was a skip_restriction case
-        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-        CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i]));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+  impl->max_num_points = max_num_points;
+
+  // Allocate
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out));
+  impl->num_inputs  = num_input_fields;
+  impl->num_outputs = num_output_fields;
+
+  // Set up infield and outfield e-vecs and q-vecs
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields,
+                                               max_num_points, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
+                                               impl->q_vecs_out, num_output_fields, max_num_points, num_elem));
+
+  // Reorder fields to allow reuse of buffers
+  impl->max_active_e_vec_len = 0;
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedSize            e_vec_len_i;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                       = true;
+      impl->input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) {
+        // CEED_EVAL_WEIGHT
+        CeedCallBackend(CeedVectorDestroy(&vec_i));
+        continue;
+      };
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                       = true;
+          impl->input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedSize            e_vec_len_i;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                        = true;
+      impl->output_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_output_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                        = true;
+          impl->output_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
+  {
+    // Create two work vectors for diagonal assembly
+    CeedVector temp_1, temp_2;
+
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1));
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2));
+  }
+  CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Input Basis Action AtPoints
+//------------------------------------------------------------------------------
+static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                                      CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const CeedInt *num_points,
+                                                      const bool skip_active, const bool skip_passive, CeedOperator_Cuda *impl) {
+  bool         is_active = false;
+  CeedEvalMode eval_mode;
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
+
+  // Skip active input
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (skip_active && is_active) return CEED_ERROR_SUCCESS;
+  if (skip_passive && !is_active) {
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    return CEED_ERROR_SUCCESS;
+  }
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
+  }
+
+  // Basis action
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  switch (eval_mode) {
+    case CEED_EVAL_NONE: {
+      const CeedScalar *e_vec_array;
+
+      if (e_vec) {
+        CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array));
       } else {
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i]));
+        CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array));
       }
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
+      break;
     }
+    case CEED_EVAL_INTERP:
+    case CEED_EVAL_GRAD:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL: {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
+      CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, e_vec, q_vec));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      break;
+    }
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
-// Apply and add to output
+// Apply and add to output AtPoints
 //------------------------------------------------------------------------------
-static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
-  CeedInt             Q, num_elem, elem_size, num_input_fields, num_output_fields, size;
-  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
+  CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
+  Ceed                ceed;
+  CeedVector          active_e_vec;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Cuda  *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Setup
-  CeedCallBackend(CeedOperatorSetup_Cuda(op));
+  CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
+  num_points     = impl->num_points;
+  max_num_points = impl->max_num_points;
 
-  // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec));
 
-  // Input basis apply if needed
-  CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
+  // Get point coordinates
+  {
+    CeedVector          point_coords = NULL;
+    CeedElemRestriction rstr_points  = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+    if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    {
+      uint64_t state;
+      CeedCallBackend(CeedVectorGetState(point_coords, &state));
+      if (impl->points_state != state) {
+        CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+      }
+    }
+    CeedCallBackend(CeedVectorDestroy(&point_coords));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+
+  // Process inputs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedInt field = impl->input_field_order[i];
+
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl,
+                                                   request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem,
+                                                        num_points, false, false, impl));
+  }
 
   // Output pointers, as necessary
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -360,68 +881,86 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
     }
   }
 
   // Q function
-  CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
+  CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
+
+  // Restore input arrays
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl));
+  }
 
-  // Output basis apply if needed
+  // Output basis and restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+    bool         is_active = false;
+    CeedInt      field     = impl->output_field_order[i];
+    CeedEvalMode eval_mode;
+    CeedVector   l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
+
+    // Output vector
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
+      l_vec = out_vec;
+      if (!e_vec) e_vec = active_e_vec;
+    }
 
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
     // Basis action
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
         break;  // No action
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+      case CEED_EVAL_CURL: {
+        CeedBasis basis;
+
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
+        if (impl->apply_add_basis_out[field]) {
+          CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+        } else {
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+        }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
         // LCOV_EXCL_STOP
       }
     }
-  }
-
-  // Output restriction
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
 
     // Restore evec
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
     }
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+
     // Restrict
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+    if (!impl->skip_rstr_out[field]) {
+      CeedElemRestriction elem_rstr;
 
-    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
 
-  // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+  // Restore work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -432,7 +971,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
                                                                CeedRequest *request) {
   Ceed                ceed, ceed_parent;
   CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
-  CeedScalar         *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedScalar         *assembled_array;
   CeedVector         *active_inputs;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -453,19 +992,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
   // Setup
   CeedCallBackend(CeedOperatorSetup_Cuda(op));
 
-  // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
+  // Process inputs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, true, impl));
+  }
 
   // Count number of active input fields
   if (!num_active_in) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
       CeedScalar *q_vec_array;
-      CeedVector  vec;
+      CeedVector  l_vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
-      if (vec == CEED_VECTOR_ACTIVE) {
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
         CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array));
@@ -474,12 +1015,13 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
           CeedSize q_size = (CeedSize)Q * num_elem;
 
           CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field]));
-          CeedCallBackend(
-              CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem]));
+          CeedCallBackend(CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER,
+                                             &q_vec_array[field * Q * num_elem]));
         }
         num_active_in += size;
         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     impl->num_active_in = num_active_in;
     impl->qf_active_in  = active_inputs;
@@ -488,15 +1030,15 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
   // Count number of active output fields
   if (!num_active_out) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      CeedVector vec;
+      CeedVector l_vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
-      if (vec == CEED_VECTOR_ACTIVE) {
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &l_vec));
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
         num_active_out += size;
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     impl->num_active_out = num_active_out;
   }
@@ -511,16 +1053,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
 
     // Create output restriction
     CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
-                                                     num_active_in * num_active_out * num_elem * Q, strides, rstr));
+                                                     (CeedSize)num_active_in * (CeedSize)num_active_out * (CeedSize)num_elem * (CeedSize)Q, strides,
+                                                     rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
   }
   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
   CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array));
 
-  // Input basis apply
-  CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
-
   // Assemble QFunction
   for (CeedInt in = 0; in < num_active_in; in++) {
     // Set Inputs
@@ -530,38 +1070,42 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
     }
     // Set Outputs
     for (CeedInt out = 0; out < num_output_fields; out++) {
-      CeedVector vec;
+      CeedVector l_vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
       // Check if active output
-      if (vec == CEED_VECTOR_ACTIVE) {
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     // Apply QFunction
     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
   }
 
-  // Un-set output q_vecs to prevent accidental overwrite of Assembled
+  // Un-set output q-vecs to prevent accidental overwrite of Assembled
   for (CeedInt out = 0; out < num_output_fields; out++) {
-    CeedVector vec;
+    CeedVector l_vec;
 
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-    // Check if active output
-    if (vec == CEED_VECTOR_ACTIVE) {
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) {
       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
     }
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
 
   // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl));
+  }
 
   // Restore output
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -605,13 +1149,14 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator diagonal assembly with multiple active bases");
-      basis_in = basis;
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -621,6 +1166,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis
@@ -637,7 +1183,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator diagonal assembly with multiple active bases");
-      basis_out = basis;
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -647,6 +1194,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Operator data struct
@@ -758,6 +1306,10 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
   CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, cudaMemcpyHostToDevice));
   CeedCallBackend(CeedFree(&eval_modes_in));
   CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -766,8 +1318,6 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) {
 //------------------------------------------------------------------------------
 static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) {
   Ceed                ceed;
-  char               *diagonal_kernel_source;
-  const char         *diagonal_kernel_path;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
   CeedInt             num_comp, q_comp, num_nodes, num_qpts;
   CeedBasis           basis_in = NULL, basis_out = NULL;
@@ -789,14 +1339,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op,
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in));
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis
@@ -808,14 +1362,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op,
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out));
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Operator data struct
@@ -823,22 +1381,22 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op,
   CeedOperatorDiag_Cuda *diag = impl->diag;
 
   // Assemble kernel
-  CUmodule *module          = is_point_block ? &diag->module_point_block : &diag->module;
-  CeedInt   elems_per_block = 1;
+  const char diagonal_kernel_source[] = "// Diagonal assembly source\n#include <ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h>\n";
+  CUmodule  *module                   = is_point_block ? &diag->module_point_block : &diag->module;
+  CeedInt    elems_per_block          = 1;
+
   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
   CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
   if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
   else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n");
   CeedCallCuda(ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
                                       num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE",
                                       use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block));
   CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
-  CeedCallBackend(CeedFree(&diagonal_kernel_path));
-  CeedCallBackend(CeedFree(&diagonal_kernel_source));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -890,6 +1448,8 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVec
     CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr));
     CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag));
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr;
   elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag;
   CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0));
@@ -923,6 +1483,7 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVec
   CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request));
 
   // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
   return CEED_ERROR_SUCCESS;
 }
@@ -946,11 +1507,9 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op,
 //------------------------------------------------------------------------------
 // Single Operator Assembly Setup
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) {
+static int CeedOperatorAssembleSingleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) {
   Ceed                ceed;
   Ceed_Cuda          *cuda_data;
-  char               *assembly_kernel_source;
-  const char         *assembly_kernel_path;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
   CeedInt             elem_size_in, num_qpts_in = 0, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp;
   CeedEvalMode       *eval_modes_in = NULL, *eval_modes_out = NULL;
@@ -975,13 +1534,17 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
 
     CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
-      CeedEvalMode eval_mode;
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
-      basis_in = basis;
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in));
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr));
+      if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in));
       if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in;
       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in));
@@ -996,6 +1559,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
@@ -1005,14 +1569,18 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
 
     CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
-      CeedEvalMode eval_mode;
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis));
       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator assembly with multiple active bases");
-      basis_out = basis;
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out));
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &elem_rstr));
+      if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
       if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out;
       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out));
@@ -1029,6 +1597,7 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
   CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
 
@@ -1047,20 +1616,16 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
   }
 
   // Compile kernels
+  const char assembly_kernel_source[] = "// Full assembly source\n#include <ceed/jit-source/cuda/cuda-ref-operator-assemble.h>\n";
+
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
                                    num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in,
                                    "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE",
                                    asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y,
                                    "USE_CEEDSIZE", use_ceedsize_idx));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble));
-  CeedCallBackend(CeedFree(&assembly_kernel_path));
-  CeedCallBackend(CeedFree(&assembly_kernel_source));
 
   // Load into B_in, in order that they will be used in eval_modes_in
   {
@@ -1093,11 +1658,9 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
       CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar),
                                     cudaMemcpyHostToDevice));
     }
-
-    if (identity) {
-      CeedCallBackend(CeedFree(&identity));
-    }
+    CeedCallBackend(CeedFree(&identity));
   }
+  CeedCallBackend(CeedFree(&eval_modes_in));
 
   // Load into B_out, in order that they will be used in eval_modes_out
   {
@@ -1130,11 +1693,15 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
       CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar),
                                     cudaMemcpyHostToDevice));
     }
-
-    if (identity) {
-      CeedCallBackend(CeedFree(&identity));
-    }
+    CeedCallBackend(CeedFree(&identity));
   }
+  CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1142,11 +1709,11 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee
 // Assemble matrix data for COO matrix of assembled operator.
 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
 //
-// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval
-// modes).
+// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator
+// (could have multiple basis eval modes).
 // TODO: allow multiple active input restrictions/basis objects
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) {
+static int CeedOperatorAssembleSingle_Cuda(CeedOperator op, CeedInt offset, CeedVector values) {
   Ceed                ceed;
   CeedSize            values_length = 0, assembled_qf_length = 0;
   CeedInt             use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out;
@@ -1172,7 +1739,7 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed
   if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
 
   // Setup
-  if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx));
+  if (!impl->asmb) CeedCallBackend(CeedOperatorAssembleSingleSetup_Cuda(op, use_ceedsize_idx));
   CeedOperatorAssemble_Cuda *asmb = impl->asmb;
 
   assert(asmb != NULL);
@@ -1218,8 +1785,8 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed
   void   *args[] = {(void *)&num_elem_in, &asmb->d_B_in,     &asmb->d_B_out,      &orients_in,  &curl_orients_in,
                     &orients_out,         &curl_orients_out, &assembled_qf_array, &values_array};
 
-  CeedCallBackend(
-      CeedRunKernelDimShared_Cuda(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args));
+  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, asmb->LinearAssemble, NULL, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block,
+                                              shared_mem, args));
 
   // Restore arrays
   CeedCallBackend(CeedVectorRestoreArray(values, &values_array));
@@ -1239,6 +1806,270 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed
       CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Assemble Linear QFunction AtPoints
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
+  return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedOperatorLinearAssembleQFunction");
+}
+
+//------------------------------------------------------------------------------
+// Assemble Linear Diagonal AtPoints
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
+  CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
+  Ceed                ceed;
+  CeedVector          active_e_vec_in, active_e_vec_out;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Cuda  *impl;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
+  num_points     = impl->num_points;
+  max_num_points = impl->max_num_points;
+
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_in));
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_out));
+  {
+    CeedSize length_in, length_out;
+
+    CeedCallBackend(CeedVectorGetLength(active_e_vec_in, &length_in));
+    CeedCallBackend(CeedVectorGetLength(active_e_vec_out, &length_out));
+    // Need input e_vec to be longer
+    if (length_in < length_out) {
+      CeedVector temp = active_e_vec_in;
+
+      active_e_vec_in  = active_e_vec_out;
+      active_e_vec_out = temp;
+    }
+  }
+
+  // Get point coordinates
+  {
+    CeedVector          point_coords = NULL;
+    CeedElemRestriction rstr_points  = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+    if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    {
+      uint64_t state;
+      CeedCallBackend(CeedVectorGetState(point_coords, &state));
+      if (impl->points_state != state) {
+        CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+      }
+    }
+    CeedCallBackend(CeedVectorDestroy(&point_coords));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+
+  // Process inputs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, false,
+                                                        impl));
+  }
+
+  // Output pointers, as necessary
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
+    }
+  }
+
+  // Loop over active fields
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool                is_active = false, is_active_at_points = true;
+    CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0, field_in = impl->input_field_order[i];
+    CeedRestrictionType rstr_type;
+    CeedVector          l_vec;
+    CeedElemRestriction elem_rstr;
+
+    // -- Skip non-active input
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[field_in], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    if (!is_active || impl->skip_rstr_in[field_in]) continue;
+
+    // -- Get active restriction type
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[field_in], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+    is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
+    if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    else elem_size = max_num_points;
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+
+    e_vec_size = elem_size * num_comp_active;
+    CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
+    for (CeedInt s = 0; s < e_vec_size; s++) {
+      CeedVector q_vec = impl->q_vecs_in[field_in];
+
+      // Update unit vector
+      {
+        // Note: E-vec strides are node * (1) + comp * (elem_size * num_elem) + elem * (elem_size)
+        CeedInt  node = (s - 1) % elem_size, comp = (s - 1) / elem_size;
+        CeedSize start = node * 1 + comp * (elem_size * num_elem);
+        CeedSize stop  = (comp + 1) * (elem_size * num_elem);
+
+        if (s != 0) CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0));
+
+        node = s % elem_size, comp = s / elem_size;
+        start = node * 1 + comp * (elem_size * num_elem);
+        stop  = (comp + 1) * (elem_size * num_elem);
+        CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 1.0));
+      }
+
+      // Basis action
+      for (CeedInt j = 0; j < num_input_fields; j++) {
+        CeedInt field = impl->input_field_order[j];
+
+        CeedCallBackend(CeedOperatorInputBasisAtPoints_Cuda(op_input_fields[field], qf_input_fields[field], field, NULL, active_e_vec_in, num_elem,
+                                                            num_points, false, true, impl));
+      }
+
+      // Q function
+      CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
+
+      // Output basis apply if needed
+      for (CeedInt j = 0; j < num_output_fields; j++) {
+        bool                is_active = false;
+        CeedInt             elem_size = 0;
+        CeedInt             field_out = impl->output_field_order[j];
+        CeedRestrictionType rstr_type;
+        CeedEvalMode        eval_mode;
+        CeedVector          l_vec, e_vec = impl->e_vecs_out[field_out], q_vec = impl->q_vecs_out[field_out];
+        CeedElemRestriction elem_rstr;
+
+        // ---- Skip non-active output
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field_out], &l_vec));
+        is_active = l_vec == CEED_VECTOR_ACTIVE;
+        CeedCallBackend(CeedVectorDestroy(&l_vec));
+        if (!is_active) continue;
+        if (!e_vec) e_vec = active_e_vec_out;
+
+        // ---- Check if elem size matches
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field_out], &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+        if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue;
+        if (rstr_type == CEED_RESTRICTION_POINTS) {
+          CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &elem_size));
+        } else {
+          CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+        }
+        {
+          CeedInt num_comp = 0;
+
+          CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+          if (e_vec_size != num_comp * elem_size) continue;
+        }
+
+        // Basis action
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field_out], &eval_mode));
+        switch (eval_mode) {
+          case CEED_EVAL_NONE: {
+            CeedScalar *e_vec_array;
+
+            CeedCallBackend(CeedVectorTakeArray(q_vec, CEED_MEM_DEVICE, &e_vec_array));
+            CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
+            break;
+          }
+          case CEED_EVAL_INTERP:
+          case CEED_EVAL_GRAD:
+          case CEED_EVAL_DIV:
+          case CEED_EVAL_CURL: {
+            CeedBasis basis;
+
+            CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field_out], &basis));
+            if (impl->apply_add_basis_out[field_out]) {
+              CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec,
+                                                        e_vec));
+            } else {
+              CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+            }
+            CeedCallBackend(CeedBasisDestroy(&basis));
+            break;
+          }
+          // LCOV_EXCL_START
+          case CEED_EVAL_WEIGHT: {
+            return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+            // LCOV_EXCL_STOP
+          }
+        }
+
+        // Mask output e-vec
+        if (impl->skip_rstr_out[field_out]) {
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+          continue;
+        }
+        CeedCallBackend(CeedVectorPointwiseMult(e_vec, active_e_vec_in, e_vec));
+
+        // Restrict
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, assembled, request));
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+
+        // Reset q_vec for
+        if (eval_mode == CEED_EVAL_NONE) {
+          CeedScalar *e_vec_array;
+
+          CeedCallBackend(CeedVectorGetArrayWrite(e_vec, CEED_MEM_DEVICE, &e_vec_array));
+          CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
+        }
+      }
+
+      // Reset vec
+      if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(q_vec, 0.0));
+    }
+  }
+
+  // Restore CEED_EVAL_NONE
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    // Get eval_mode
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+
+    // Restore evec
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &e_vec_array));
+    }
+  }
+
+  // Restore input arrays
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Cuda(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl));
+  }
+
+  // Restore work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_in));
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_out));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1256,11 +2087,31 @@ int CeedOperatorCreate_Cuda(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Cuda));
-  CeedCallBackend(
-      CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal",
+                                         CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Create operator AtPoints
+//------------------------------------------------------------------------------
+int CeedOperatorCreateAtPoints_Cuda(CeedOperator op) {
+  Ceed               ceed;
+  CeedOperator_Cuda *impl;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedCalloc(1, &impl));
+  CeedCallBackend(CeedOperatorSetData(op, impl));
+
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
index 03ace250fb..82d21af0ac 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -24,42 +24,36 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
   using std::string;
 
   Ceed                ceed;
-  const char         *read_write_kernel_path, *read_write_kernel_source;
   CeedInt             num_input_fields, num_output_fields, size;
   CeedQFunctionField *input_fields, *output_fields;
   CeedQFunction_Cuda *data;
 
-  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
-  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data));
-
   // QFunction is built
+  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data));
   if (data->QFunction) return CEED_ERROR_SUCCESS;
 
-  CeedCheck(data->qfunction_source, ceed, CEED_ERROR_BACKEND, "No QFunction source or CUfunction provided.");
+  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
 
   // QFunction kernel generation
   CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
 
   // Build strings for final kernel
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-qfunction.h", &read_write_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n");
-  {
-    char *source;
-
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &source));
-    read_write_kernel_source = source;
-  }
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n");
-  string        qfunction_source(data->qfunction_source);
   string        qfunction_name(data->qfunction_name);
-  string        read_write(read_write_kernel_source);
   string        kernel_name = "CeedKernelCudaRefQFunction_" + qfunction_name;
   ostringstream code;
 
-  // Defintions
-  code << read_write;
-  code << qfunction_source;
-  code << "\n";
+  // Definitions
+  code << "// QFunction source\n";
+  code << "#include <ceed/jit-source/cuda/cuda-ref-qfunction.h>\n\n";
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_BACKEND, "No QFunction source or CUfunction provided.");
+
+    code << "// User QFunction source\n";
+    code << "#include \"" << source_path << "\"\n\n";
+  }
   code << "extern \"C\" __global__ void " << kernel_name << "(void *ctx, CeedInt Q, Fields_Cuda fields) {\n";
 
   // Inputs
@@ -69,7 +63,7 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
     code << "  const CeedInt size_input_" << i << " = " << size << ";\n";
     code << "  CeedScalar input_" << i << "[size_input_" << i << "];\n";
   }
-  code << "  const CeedScalar* inputs[" << num_input_fields << "];\n";
+  code << "  const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     code << "  inputs[" << i << "] = input_" << i << ";\n";
   }
@@ -82,7 +76,7 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
     code << "  const CeedInt size_output_" << i << " = " << size << ";\n";
     code << "  CeedScalar output_" << i << "[size_output_" << i << "];\n";
   }
-  code << "  CeedScalar* outputs[" << num_output_fields << "];\n";
+  code << "  CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     code << "  outputs[" << i << "] = output_" << i << ";\n";
   }
@@ -111,18 +105,10 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) {
   code << "  }\n";
   code << "}\n";
 
-  // View kernel for debugging
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated QFunction Kernels:\n");
-  CeedDebug(ceed, code.str().c_str());
-
   // Compile kernel
   CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 0));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, kernel_name.c_str(), &data->QFunction));
-
-  // Cleanup
-  CeedCallBackend(CeedFree(&data->qfunction_source));
-  CeedCallBackend(CeedFree(&read_write_kernel_path));
-  CeedCallBackend(CeedFree(&read_write_kernel_source));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h
index d8ca4f175b..360b8b9673 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction.c b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
index f52aebb685..ded455665b 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -58,6 +58,7 @@ static int CeedQFunctionApply_Cuda(CeedQFunction qf, CeedInt Q, CeedVector *U, C
 
   // Restore context
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &data->d_c));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -95,16 +96,13 @@ int CeedQFunctionCreate_Cuda(CeedQFunction qf) {
   CeedCallBackend(CeedCalloc(1, &data));
   CeedCallBackend(CeedQFunctionSetData(qf, data));
 
-  // Read QFunction source
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
-  CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "SetCUDAUserFunction", CeedQFunctionSetCUDAUserFunction_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
index 4257265987..491e658338 100644
--- a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
+++ b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -37,6 +37,7 @@ static inline int CeedQFunctionContextSyncH2D_Cuda(const CeedQFunctionContext ct
     impl->d_data = impl->d_data_owned;
   }
   CeedCallCuda(ceed, cudaMemcpy(impl->d_data, impl->h_data, ctx_size, cudaMemcpyHostToDevice));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -64,6 +65,7 @@ static inline int CeedQFunctionContextSyncD2H_Cuda(const CeedQFunctionContext ct
     impl->h_data = impl->h_data_owned;
   }
   CeedCallCuda(ceed, cudaMemcpy(impl->h_data, impl->d_data, ctx_size, cudaMemcpyDeviceToHost));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -77,7 +79,9 @@ static inline int CeedQFunctionContextSync_Cuda(const CeedQFunctionContext ctx,
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSyncH2D_Cuda(ctx);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -205,6 +209,7 @@ static int CeedQFunctionContextSetDataDevice_Cuda(const CeedQFunctionContext ctx
       impl->d_data          = data;
       break;
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -220,7 +225,9 @@ static int CeedQFunctionContextSetData_Cuda(const CeedQFunctionContext ctx, cons
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSetDataDevice_Cuda(ctx, copy_mode, data);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -335,6 +342,7 @@ int CeedQFunctionContextCreate_Cuda(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c
index f253b5413d..f390ec0b4c 100644
--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c
+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -24,36 +24,34 @@
 static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr) {
   Ceed                      ceed;
   bool                      is_deterministic;
-  char                     *restriction_kernel_source;
-  const char               *restriction_kernel_path;
   CeedInt                   num_elem, num_comp, elem_size, comp_stride;
   CeedRestrictionType       rstr_type;
   CeedElemRestriction_Cuda *impl;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
-  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
-  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr, &elem_size));
+  } else {
+    CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
+  }
   is_deterministic = impl->d_l_vec_indices != NULL;
 
   // Compile CUDA kernels
   switch (rstr_type) {
     case CEED_RESTRICTION_STRIDED: {
-      bool    has_backend_strides;
-      CeedInt strides[3] = {1, num_elem * elem_size, elem_size};
+      const char restriction_kernel_source[] = "// Strided restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-strided.h>\n";
+      bool       has_backend_strides;
+      CeedInt    strides[3] = {1, num_elem * elem_size, elem_size};
 
       CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
       if (!has_backend_strides) {
         CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
       }
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-strided.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
       CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                        "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM",
                                        strides[2]));
@@ -61,27 +59,30 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose));
     } break;
     case CEED_RESTRICTION_STANDARD: {
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      const char restriction_kernel_source[] = "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
+
       CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                        "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                        "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose));
     } break;
+    case CEED_RESTRICTION_POINTS: {
+      const char restriction_kernel_source[] =
+          "// AtPoints restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-at-points.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
+
+      CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
+                                       "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
+                                       "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
+      CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose));
+    } break;
     case CEED_RESTRICTION_ORIENTED: {
-      const char *offset_kernel_path;
-      char      **file_paths     = NULL;
-      CeedInt     num_file_paths = 0;
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-oriented.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path));
-      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      const char restriction_kernel_source[] =
+          "// Oriented restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-oriented.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
+
       CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                        "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                        "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
@@ -89,22 +90,11 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose));
-      // Cleanup
-      CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-      CeedCall(CeedFree(&file_paths));
     } break;
     case CEED_RESTRICTION_CURL_ORIENTED: {
-      const char *offset_kernel_path;
-      char      **file_paths     = NULL;
-      CeedInt     num_file_paths = 0;
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path));
-      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      const char restriction_kernel_source[] =
+          "// Curl oriented restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/cuda/cuda-ref-restriction-offset.h>\n";
       CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                        "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                        "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
@@ -114,19 +104,9 @@ static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose));
       CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose));
-      // Cleanup
-      CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-      CeedCall(CeedFree(&file_paths));
-    } break;
-    case CEED_RESTRICTION_POINTS: {
-      // LCOV_EXCL_START
-      return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-      // LCOV_EXCL_STOP
     } break;
   }
-  CeedCallBackend(CeedFree(&restriction_kernel_path));
-  CeedCallBackend(CeedFree(&restriction_kernel_source));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -175,6 +155,7 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C
 
         CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args));
       } break;
+      case CEED_RESTRICTION_POINTS:
       case CEED_RESTRICTION_STANDARD: {
         void *args[] = {&impl->d_offsets, &d_u, &d_v};
 
@@ -206,11 +187,6 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C
           CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args));
         }
       } break;
-      case CEED_RESTRICTION_POINTS: {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-        // LCOV_EXCL_STOP
-      } break;
     }
   } else {
     // E-vector -> L-vector
@@ -224,6 +200,17 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C
 
         CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
       } break;
+      case CEED_RESTRICTION_POINTS: {
+        if (!is_deterministic) {
+          void *args[] = {&impl->d_offsets, &impl->d_points_per_elem, &d_u, &d_v};
+
+          CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
+        } else {
+          void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_points_per_elem, &impl->d_t_offsets, &d_u, &d_v};
+
+          CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args));
+        }
+      } break;
       case CEED_RESTRICTION_STANDARD: {
         if (!is_deterministic) {
           void *args[] = {&impl->d_offsets, &d_u, &d_v};
@@ -291,11 +278,6 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C
           }
         }
       } break;
-      case CEED_RESTRICTION_POINTS: {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-        // LCOV_EXCL_STOP
-      } break;
     }
   }
 
@@ -304,6 +286,7 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C
   // Restore arrays
   CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -335,14 +318,16 @@ static int CeedElemRestrictionApplyUnoriented_Cuda(CeedElemRestriction rstr, Cee
 //------------------------------------------------------------------------------
 static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) {
   CeedElemRestriction_Cuda *impl;
+  CeedRestrictionType       rstr_type;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   switch (mem_type) {
     case CEED_MEM_HOST:
-      *offsets = impl->h_offsets;
+      *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->h_offsets_at_points : impl->h_offsets;
       break;
     case CEED_MEM_DEVICE:
-      *offsets = impl->d_offsets;
+      *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->d_offsets_at_points : impl->d_offsets;
       break;
   }
   return CEED_ERROR_SUCCESS;
@@ -384,6 +369,17 @@ static int CeedElemRestrictionGetCurlOrientations_Cuda(CeedElemRestriction rstr,
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Get offset for padded AtPoints E-layout
+//------------------------------------------------------------------------------
+static int CeedElemRestrictionGetAtPointsElementOffset_Cuda(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset) {
+  CeedInt layout[3];
+
+  CeedCallBackend(CeedElemRestrictionGetELayout(rstr, layout));
+  *elem_offset = 0 * layout[0] + 0 * layout[1] + elem * layout[2];
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy restriction
 //------------------------------------------------------------------------------
@@ -405,25 +401,31 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) {
   CeedCallCuda(ceed, cudaFree((bool *)impl->d_orients_owned));
   CeedCallBackend(CeedFree(&impl->h_curl_orients_owned));
   CeedCallCuda(ceed, cudaFree((CeedInt8 *)impl->d_curl_orients_owned));
+  CeedCallBackend(CeedFree(&impl->h_offsets_at_points_owned));
+  CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_offsets_at_points_owned));
+  CeedCallBackend(CeedFree(&impl->h_points_per_elem_owned));
+  CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_points_per_elem_owned));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Create transpose offsets and indices
 //------------------------------------------------------------------------------
-static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt *indices) {
+static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt elem_size, const CeedInt *indices) {
   Ceed                      ceed;
   bool                     *is_node;
   CeedSize                  l_size;
-  CeedInt                   num_elem, elem_size, num_comp, num_nodes = 0;
+  CeedInt                   num_elem, num_comp, num_nodes = 0;
   CeedInt                  *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices;
+  CeedRestrictionType       rstr_type;
   CeedElemRestriction_Cuda *impl;
 
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
   const CeedInt size_indices = num_elem * elem_size;
@@ -486,6 +488,7 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const
   CeedCallBackend(CeedFree(&l_vec_indices));
   CeedCallBackend(CeedFree(&t_offsets));
   CeedCallBackend(CeedFree(&t_indices));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -496,16 +499,27 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
                                    const CeedInt8 *curl_orients, CeedElemRestriction rstr) {
   Ceed                      ceed, ceed_parent;
   bool                      is_deterministic;
-  CeedInt                   num_elem, elem_size;
+  CeedInt                   num_elem, num_comp, elem_size;
   CeedRestrictionType       rstr_type;
   CeedElemRestriction_Cuda *impl;
 
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
   CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
+  CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
   CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
+  // Use max number of points as elem size for AtPoints restrictions
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedInt max_points = 0;
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      max_points = CeedIntMax(max_points, offsets[i + 1] - offsets[i]);
+    }
+    elem_size = max_points;
+  }
   const CeedInt size = num_elem * elem_size;
 
   CeedCallBackend(CeedCalloc(1, &impl));
@@ -526,6 +540,51 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
     }
   }
 
+  // Pad AtPoints indices
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedSize offsets_len = elem_size * num_elem, at_points_size = num_elem + 1;
+    CeedInt  max_points = elem_size, *offsets_padded, *points_per_elem;
+
+    CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "only MemType Host supported when creating AtPoints restriction");
+    CeedCallBackend(CeedMalloc(offsets_len, &offsets_padded));
+    CeedCallBackend(CeedMalloc(num_elem, &points_per_elem));
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points = offsets[i + 1] - offsets[i];
+      CeedInt last_point = 0;
+
+      points_per_elem[i] = num_points;
+      at_points_size += num_points;
+      // -- Copy all points in element
+      for (CeedInt j = 0; j < num_points; j++) {
+        offsets_padded[i * max_points + j] = offsets[offsets[i] + j] * num_comp;
+        last_point                         = offsets_padded[i * max_points + j];
+      }
+      // -- Replicate out last point in element
+      for (CeedInt j = num_points; j < max_points; j++) {
+        offsets_padded[i * max_points + j] = last_point;
+      }
+    }
+    CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed,
+                                            &impl->h_offsets_at_points));
+    CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_offsets_at_points_owned, at_points_size * sizeof(CeedInt)));
+    CeedCallCuda(ceed, cudaMemcpy((CeedInt **)impl->d_offsets_at_points_owned, impl->h_offsets_at_points, at_points_size * sizeof(CeedInt),
+                                  cudaMemcpyHostToDevice));
+    impl->d_offsets_at_points = (CeedInt *)impl->d_offsets_at_points_owned;
+
+    // -- Use padded offsets for the rest of the setup
+    offsets   = (const CeedInt *)offsets_padded;
+    copy_mode = CEED_OWN_POINTER;
+    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, elem_size * num_elem * num_comp));
+
+    // -- Points per element
+    CeedCallBackend(CeedSetHostCeedIntArray(points_per_elem, CEED_OWN_POINTER, num_elem, &impl->h_points_per_elem_owned,
+                                            &impl->h_points_per_elem_borrowed, &impl->h_points_per_elem));
+    CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_points_per_elem_owned, num_elem * sizeof(CeedInt)));
+    CeedCallCuda(ceed,
+                 cudaMemcpy((CeedInt **)impl->d_points_per_elem_owned, impl->h_points_per_elem, num_elem * sizeof(CeedInt), cudaMemcpyHostToDevice));
+    impl->d_points_per_elem = (CeedInt *)impl->d_points_per_elem_owned;
+  }
+
   // Set up device offset/orientation arrays
   if (rstr_type != CEED_RESTRICTION_STRIDED) {
     switch (mem_type) {
@@ -534,7 +593,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
         CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_offsets_owned, size * sizeof(CeedInt)));
         CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_offsets_owned, impl->h_offsets, size * sizeof(CeedInt), cudaMemcpyHostToDevice));
         impl->d_offsets = (CeedInt *)impl->d_offsets_owned;
-        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, offsets));
+        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, elem_size, offsets));
       } break;
       case CEED_MEM_DEVICE: {
         CeedCallBackend(CeedSetDeviceCeedIntArray_Cuda(ceed, offsets, copy_mode, size, &impl->d_offsets_owned, &impl->d_offsets_borrowed,
@@ -542,7 +601,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
         CeedCallBackend(CeedMalloc(size, &impl->h_offsets_owned));
         CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->h_offsets_owned, impl->d_offsets, size * sizeof(CeedInt), cudaMemcpyDeviceToHost));
         impl->h_offsets = impl->h_offsets_owned;
-        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, offsets));
+        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, elem_size, offsets));
       } break;
     }
 
@@ -592,7 +651,12 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Cuda));
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset",
+                                           CeedElemRestrictionGetAtPointsElementOffset_Cuda));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c
index 2759b38a4c..b0489d36d6 100644
--- a/backends/cuda-ref/ceed-cuda-ref-vector.c
+++ b/backends/cuda-ref/ceed-cuda-ref-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -41,10 +41,8 @@ static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_
 static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) {
   CeedSize         length;
   size_t           bytes;
-  Ceed             ceed;
   CeedVector_Cuda *impl;
 
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
 
   CeedCheck(impl->h_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid host data to sync to device");
@@ -56,10 +54,10 @@ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) {
   } else if (impl->d_array_owned) {
     impl->d_array = impl->d_array_owned;
   } else {
-    CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes));
+    CeedCallCuda(CeedVectorReturnCeed(vec), cudaMalloc((void **)&impl->d_array_owned, bytes));
     impl->d_array = impl->d_array_owned;
   }
-  CeedCallCuda(ceed, cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice));
+  CeedCallCuda(CeedVectorReturnCeed(vec), cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -68,13 +66,11 @@ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) {
 //------------------------------------------------------------------------------
 static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) {
   CeedSize         length;
-  Ceed             ceed;
   CeedVector_Cuda *impl;
 
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
 
-  CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host");
+  CeedCheck(impl->d_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid device data to sync to host");
 
   if (impl->h_array_borrowed) {
     impl->h_array = impl->h_array_borrowed;
@@ -91,7 +87,7 @@ static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) {
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   size_t bytes = length * sizeof(CeedScalar);
 
-  CeedCallCuda(ceed, cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost));
+  CeedCallCuda(CeedVectorReturnCeed(vec), cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -111,7 +107,9 @@ static int CeedVectorSyncArray_Cuda(const CeedVector vec, CeedMemType mem_type)
     case CEED_MEM_DEVICE:
       return CeedVectorSyncH2D_Cuda(vec);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -202,6 +200,7 @@ static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMod
 
   CeedCallBackend(CeedSetDeviceCeedScalarArray_Cuda(ceed, array, copy_mode, length, (const CeedScalar **)&impl->d_array_owned,
                                                     (const CeedScalar **)&impl->d_array_borrowed, (const CeedScalar **)&impl->d_array));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -220,7 +219,73 @@ static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_t
     case CEED_MEM_DEVICE:
       return CeedVectorSetArrayDevice_Cuda(vec, copy_mode, array);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
+}
+
+//------------------------------------------------------------------------------
+// Copy host array to value strided
+//------------------------------------------------------------------------------
+static int CeedHostCopyStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *h_copy_array) {
+  for (CeedSize i = start; i < stop; i += step) h_copy_array[i] = h_array[i];
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Copy device array to value strided (impl in .cu file)
+//------------------------------------------------------------------------------
+int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array);
+
+//------------------------------------------------------------------------------
+// Copy a vector to a value strided
+//------------------------------------------------------------------------------
+static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy) {
+  CeedSize         length;
+  CeedVector_Cuda *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  {
+    CeedSize length_vec, length_copy;
+
+    CeedCallBackend(CeedVectorGetLength(vec, &length_vec));
+    CeedCallBackend(CeedVectorGetLength(vec_copy, &length_copy));
+    length = length_vec < length_copy ? length_vec : length_copy;
+  }
+  if (stop == -1) stop = length;
+  // Set value for synced device/host array
+  if (impl->d_array) {
+    CeedScalar *copy_array;
+
+    CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array));
+#if (CUDA_VERSION >= 12000)
+    cublasHandle_t handle;
+    Ceed           ceed;
+
+    CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+    CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallCublas(ceed, cublasScopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+#else  /* CEED_SCALAR */
+    CeedCallCublas(ceed, cublasDcopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+#endif /* CEED_SCALAR */
+    CeedCallBackend(CeedDestroy(&ceed));
+#else  /* CUDA_VERSION */
+    CeedCallBackend(CeedDeviceCopyStrided_Cuda(impl->d_array, start, stop, step, copy_array));
+#endif /* CUDA_VERSION */
+    CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+    impl->h_array = NULL;
+  } else if (impl->h_array) {
+    CeedScalar *copy_array;
+
+    CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array));
+    CeedCallBackend(CeedHostCopyStrided_Cuda(impl->h_array, start, stop, step, copy_array));
+    CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+    impl->d_array = NULL;
+  } else {
+    return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
+  }
+  return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
@@ -260,16 +325,55 @@ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) {
     }
   }
   if (impl->d_array) {
-    CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val));
+    if (val == 0) {
+      CeedCallCuda(CeedVectorReturnCeed(vec), cudaMemset(impl->d_array, 0, length * sizeof(CeedScalar)));
+    } else {
+      CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val));
+    }
     impl->h_array = NULL;
-  }
-  if (impl->h_array) {
+  } else if (impl->h_array) {
     CeedCallBackend(CeedHostSetValue_Cuda(impl->h_array, length, val));
     impl->d_array = NULL;
   }
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Set host array to value strided
+//------------------------------------------------------------------------------
+static int CeedHostSetValueStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
+  for (CeedSize i = start; i < stop; i += step) h_array[i] = val;
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Set device array to value strided (impl in .cu file)
+//------------------------------------------------------------------------------
+int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val);
+
+//------------------------------------------------------------------------------
+// Set a vector to a value strided
+//------------------------------------------------------------------------------
+static int CeedVectorSetValueStrided_Cuda(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
+  CeedSize         length;
+  CeedVector_Cuda *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+  // Set value for synced device/host array
+  if (stop == -1) stop = length;
+  if (impl->d_array) {
+    CeedCallBackend(CeedDeviceSetValueStrided_Cuda(impl->d_array, start, stop, step, val));
+    impl->h_array = NULL;
+  } else if (impl->h_array) {
+    CeedCallBackend(CeedHostSetValueStrided_Cuda(impl->h_array, start, stop, step, val));
+    impl->d_array = NULL;
+  } else {
+    return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Vector Take Array
 //------------------------------------------------------------------------------
@@ -377,9 +481,9 @@ static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType
 static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *norm) {
   Ceed     ceed;
   CeedSize length;
-#if CUDA_VERSION < 12000
+#if (CUDA_VERSION < 12000)
   CeedSize num_calls;
-#endif
+#endif /* CUDA_VERSION */
   const CeedScalar *d_array;
   CeedVector_Cuda  *impl;
   cublasHandle_t    handle;
@@ -389,146 +493,147 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));
 
-#if CUDA_VERSION < 12000
+#if (CUDA_VERSION < 12000)
   // With CUDA 12, we can use the 64-bit integer interface. Prior to that,
   // we need to check if the vector is too long to handle with int32,
   // and if so, divide it into subsections for repeated cuBLAS calls.
   num_calls = length / INT_MAX;
   if (length % INT_MAX > 0) num_calls += 1;
-#endif
+#endif /* CUDA_VERSION */
 
   // Compute norm
   CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
   switch (type) {
     case CEED_NORM_1: {
       *norm = 0.0;
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-#if CUDA_VERSION >= 12000  // We have CUDA 12, and can use 64-bit integers
-        CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
-#else
-        float  sub_norm = 0.0;
-        float *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
-          *norm += sub_norm;
-        }
-#endif
-      } else {
-#if CUDA_VERSION >= 12000
-        CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
-#else
-        double  sub_norm = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
-          *norm += sub_norm;
-        }
-#endif
+#if defined(CEED_SCALAR_IS_FP32)
+#if (CUDA_VERSION >= 12000)  // We have CUDA 12, and can use 64-bit integers
+      CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
+#else  /* CUDA_VERSION */
+      float  sub_norm = 0.0;
+      float *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
+        *norm += sub_norm;
       }
+#endif /* CUDA_VERSION */
+#else  /* CEED_SCALAR */
+#if (CUDA_VERSION >= 12000)
+      CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
+#else  /* CUDA_VERSION */
+      double  sub_norm = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
+        *norm += sub_norm;
+      }
+#endif /* CUDA_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
     case CEED_NORM_2: {
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-#if CUDA_VERSION >= 12000
-        CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
-#else
-        float  sub_norm = 0.0, norm_sum = 0.0;
-        float *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
-          norm_sum += sub_norm * sub_norm;
-        }
-        *norm = sqrt(norm_sum);
-#endif
-      } else {
-#if CUDA_VERSION >= 12000
-        CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
-#else
-        double  sub_norm = 0.0, norm_sum = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
-          norm_sum += sub_norm * sub_norm;
-        }
-        *norm = sqrt(norm_sum);
-#endif
+#if defined(CEED_SCALAR_IS_FP32)
+#if (CUDA_VERSION >= 12000)
+      CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
+#else  /* CUDA_VERSION */
+      float  sub_norm = 0.0, norm_sum = 0.0;
+      float *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
+        norm_sum += sub_norm * sub_norm;
+      }
+      *norm = sqrt(norm_sum);
+#endif /* CUDA_VERSION */
+#else  /* CEED_SCALAR */
+#if (CUDA_VERSION >= 12000)
+      CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
+#else  /* CUDA_VERSION */
+      double  sub_norm = 0.0, norm_sum = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
+        norm_sum += sub_norm * sub_norm;
       }
+      *norm = sqrt(norm_sum);
+#endif /* CUDA_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
     case CEED_NORM_MAX: {
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-#if CUDA_VERSION >= 12000
-        int64_t    index;
-        CeedScalar norm_no_abs;
-
-        CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
-        CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
-        *norm = fabs(norm_no_abs);
-#else
-        CeedInt index;
-        float   sub_max = 0.0, current_max = 0.0;
-        float  *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
-          CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
-          if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
-        }
-        *norm = current_max;
-#endif
-      } else {
-#if CUDA_VERSION >= 12000
-        int64_t    index;
-        CeedScalar norm_no_abs;
-
-        CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
-        CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
-        *norm = fabs(norm_no_abs);
-#else
-        CeedInt index;
-        double  sub_max = 0.0, current_max = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
-          CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
-          if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
-        }
-        *norm = current_max;
-#endif
+#if defined(CEED_SCALAR_IS_FP32)
+#if (CUDA_VERSION >= 12000)
+      int64_t    index;
+      CeedScalar norm_no_abs;
+
+      CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
+      CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
+      *norm = fabs(norm_no_abs);
+#else  /* CUDA_VERSION */
+      CeedInt index;
+      float   sub_max = 0.0, current_max = 0.0;
+      float  *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
+        CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
+        if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
       }
+      *norm = current_max;
+#endif /* CUDA_VERSION */
+#else  /* CEED_SCALAR */
+#if (CUDA_VERSION >= 12000)
+      int64_t    index;
+      CeedScalar norm_no_abs;
+
+      CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
+      CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
+      *norm = fabs(norm_no_abs);
+#else  /* CUDA_VERSION */
+      CeedInt index;
+      double  sub_max = 0.0, current_max = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
+        CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
+        if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
+      }
+      *norm = current_max;
+#endif /* CUDA_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
   }
   CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -580,13 +685,29 @@ int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length)
 //------------------------------------------------------------------------------
 static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) {
   CeedSize         length;
-  CeedVector_Cuda *x_impl;
+  CeedVector_Cuda *impl;
 
-  CeedCallBackend(CeedVectorGetData(x, &x_impl));
+  CeedCallBackend(CeedVectorGetData(x, &impl));
   CeedCallBackend(CeedVectorGetLength(x, &length));
   // Set value for synced device/host array
-  if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Cuda(x_impl->d_array, alpha, length));
-  if (x_impl->h_array) CeedCallBackend(CeedHostScale_Cuda(x_impl->h_array, alpha, length));
+  if (impl->d_array) {
+#if (CUDA_VERSION >= 12000)
+    cublasHandle_t handle;
+
+    CeedCallBackend(CeedGetCublasHandle_Cuda(CeedVectorReturnCeed(x), &handle));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallCublas(CeedVectorReturnCeed(x), cublasSscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
+#else  /* CEED_SCALAR */
+    CeedCallCublas(CeedVectorReturnCeed(x), cublasDscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
+#endif /* CEED_SCALAR */
+#else  /* CUDA_VERSION */
+    CeedCallBackend(CeedDeviceScale_Cuda(impl->d_array, alpha, length));
+#endif /* CUDA_VERSION */
+    impl->h_array = NULL;
+  } else if (impl->h_array) {
+    CeedCallBackend(CeedHostScale_Cuda(impl->h_array, alpha, length));
+    impl->d_array = NULL;
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -607,22 +728,32 @@ int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_arr
 // Compute y = alpha x + y
 //------------------------------------------------------------------------------
 static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
-  Ceed             ceed;
   CeedSize         length;
   CeedVector_Cuda *y_impl, *x_impl;
 
-  CeedCallBackend(CeedVectorGetCeed(y, &ceed));
   CeedCallBackend(CeedVectorGetData(y, &y_impl));
   CeedCallBackend(CeedVectorGetData(x, &x_impl));
   CeedCallBackend(CeedVectorGetLength(y, &length));
   // Set value for synced device/host array
   if (y_impl->d_array) {
     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
+#if (CUDA_VERSION >= 12000)
+    cublasHandle_t handle;
+
+    CeedCallBackend(CeedGetCublasHandle_Cuda(CeedVectorReturnCeed(y), &handle));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallCublas(CeedVectorReturnCeed(y), cublasSaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
+#else  /* CEED_SCALAR */
+    CeedCallCublas(CeedVectorReturnCeed(y), cublasDaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
+#endif /* CEED_SCALAR */
+#else  /* CUDA_VERSION */
     CeedCallBackend(CeedDeviceAXPY_Cuda(y_impl->d_array, alpha, x_impl->d_array, length));
-  }
-  if (y_impl->h_array) {
+#endif /* CUDA_VERSION */
+    y_impl->h_array = NULL;
+  } else if (y_impl->h_array) {
     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
     CeedCallBackend(CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length));
+    y_impl->d_array = NULL;
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -728,18 +859,21 @@ int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "CopyStrided", CeedVectorCopyStrided_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", CeedVectorSetValue_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", CeedVectorSetValueStrided_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())CeedVectorScale_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())CeedVectorAXPY_Cuda));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", CeedVectorScale_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", CeedVectorAXPY_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Cuda));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedVectorSetData(vec, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/cuda-ref/ceed-cuda-ref.c b/backends/cuda-ref/ceed-cuda-ref.c
index bbfa8cf875..0937b0ce17 100644
--- a/backends/cuda-ref/ceed-cuda-ref.c
+++ b/backends/cuda-ref/ceed-cuda-ref.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -57,9 +57,11 @@ static int CeedInit_Cuda_ref(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreateAtPoints", CeedElemRestrictionCreate_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreateAtPoints_Cuda));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index 349aa8ef3a..337e7c92a0 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -48,6 +48,18 @@ typedef struct {
   const CeedInt8 *d_curl_orients;
   const CeedInt8 *d_curl_orients_borrowed;
   const CeedInt8 *d_curl_orients_owned;
+  const CeedInt  *h_offsets_at_points;
+  const CeedInt  *h_offsets_at_points_borrowed;
+  const CeedInt  *h_offsets_at_points_owned;
+  const CeedInt  *d_offsets_at_points;
+  const CeedInt  *d_offsets_at_points_borrowed;
+  const CeedInt  *d_offsets_at_points_owned;
+  const CeedInt  *h_points_per_elem;
+  const CeedInt  *h_points_per_elem_borrowed;
+  const CeedInt  *h_points_per_elem_owned;
+  const CeedInt  *d_points_per_elem;
+  const CeedInt  *d_points_per_elem_borrowed;
+  const CeedInt  *d_points_per_elem_owned;
 } CeedElemRestriction_Cuda;
 
 typedef struct {
@@ -55,9 +67,19 @@ typedef struct {
   CUfunction  Interp;
   CUfunction  Grad;
   CUfunction  Weight;
+  CUmodule    moduleAtPoints;
+  CeedInt     num_points;
+  CUfunction  InterpAtPoints;
+  CUfunction  InterpTransposeAtPoints;
+  CUfunction  GradAtPoints;
+  CUfunction  GradTransposeAtPoints;
   CeedScalar *d_interp_1d;
   CeedScalar *d_grad_1d;
   CeedScalar *d_q_weight_1d;
+  CeedScalar *d_chebyshev_interp_1d;
+  CeedInt     num_elem_at_points;
+  CeedInt    *h_points_per_elem;
+  CeedInt    *d_points_per_elem;
 } CeedBasis_Cuda;
 
 typedef struct {
@@ -77,7 +99,6 @@ typedef struct {
 typedef struct {
   CUmodule    module;
   const char *qfunction_name;
-  const char *qfunction_source;
   CUfunction  QFunction;
   Fields_Cuda fields;
   void       *d_c;
@@ -111,12 +132,17 @@ typedef struct {
 } CeedOperatorAssemble_Cuda;
 
 typedef struct {
-  CeedVector                *e_vecs;      // E-vectors, inputs followed by outputs
-  CeedVector                *q_vecs_in;   // Input Q-vectors needed to apply operator
-  CeedVector                *q_vecs_out;  // Output Q-vectors needed to apply operator
+  bool                      *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
+  uint64_t                  *input_states, points_state;  // State tracking for passive inputs
+  CeedVector                *e_vecs_in, *e_vecs_out;
+  CeedVector                *q_vecs_in, *q_vecs_out;
   CeedInt                    num_inputs, num_outputs;
   CeedInt                    num_active_in, num_active_out;
-  CeedVector                *qf_active_in;
+  CeedInt                   *input_field_order, *output_field_order;
+  CeedSize                   max_active_e_vec_len;
+  CeedInt                    max_num_points;
+  CeedInt                   *num_points;
+  CeedVector                *qf_active_in, point_coords_elem;
   CeedOperatorDiag_Cuda     *diag;
   CeedOperatorAssemble_Cuda *asmb;
 } CeedOperator_Cuda;
@@ -142,3 +168,4 @@ CEED_INTERN int CeedQFunctionCreate_Cuda(CeedQFunction qf);
 CEED_INTERN int CeedQFunctionContextCreate_Cuda(CeedQFunctionContext ctx);
 
 CEED_INTERN int CeedOperatorCreate_Cuda(CeedOperator op);
+CEED_INTERN int CeedOperatorCreateAtPoints_Cuda(CeedOperator op);
diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu
index 51c5565308..cae3bad181 100644
--- a/backends/cuda-ref/kernels/cuda-ref-vector.cu
+++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,159 +8,188 @@
 #include <ceed.h>
 #include <cuda.h>
 
+//------------------------------------------------------------------------------
+// Kernel for copy strided on device
+//------------------------------------------------------------------------------
+__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *__restrict__ vec_copy) {
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < stop - start) {
+    if (index % step == 0) vec_copy[start + index] = vec[start + index];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Copy strided on device memory
+//------------------------------------------------------------------------------
+extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array) {
+  const int      block_size = 512;
+  const CeedSize copy_size  = stop - start;
+  int            grid_size  = copy_size / block_size;
+
+  if (block_size * grid_size < copy_size) grid_size += 1;
+  copyStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, d_copy_array);
+  return 0;
+}
+
 //------------------------------------------------------------------------------
 // Kernel for set value on device
 //------------------------------------------------------------------------------
-__global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size,
-                                 CeedScalar val) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
-  vec[index] = val;
+__global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) vec[index] = val;
 }
 
 //------------------------------------------------------------------------------
 // Set value on device memory
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
-                                       CeedScalar val) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
+
+  if (block_size * grid_size < vec_size) grid_size += 1;
+  setValueK<<<grid_size, block_size>>>(d_array, length, val);
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+// Kernel for set value strided on device
+//------------------------------------------------------------------------------
+__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < stop - start) {
+    if (index % step == 0) vec[start + index] = val;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Set value strided on device memory
+//------------------------------------------------------------------------------
+extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
+  const int      block_size = 512;
+  const CeedSize set_size   = stop - start;
+  int            grid_size  = set_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
-  setValueK<<<grid_size,block_size>>>(d_array, length, val);
+  if (block_size * grid_size < set_size) grid_size += 1;
+  setValueStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, val);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for taking reciprocal
 //------------------------------------------------------------------------------
-__global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
-  if (fabs(vec[index]) > 1E-16)
-    vec[index] = 1./vec[index];
+__global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) {
+    if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
+  }
 }
 
 //------------------------------------------------------------------------------
 // Take vector reciprocal in device memory
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
-  rcpValueK<<<grid_size,block_size>>>(d_array, length);
+  if (block_size * grid_size < vec_size) grid_size += 1;
+  rcpValueK<<<grid_size, block_size>>>(d_array, length);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for scale
 //------------------------------------------------------------------------------
-__global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
-    CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
-  x[index] *= alpha;
+__global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) x[index] *= alpha;
 }
 
 //------------------------------------------------------------------------------
 // Compute x = alpha x on device
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
-    CeedSize length) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
-  scaleValueK<<<grid_size,block_size>>>(x_array, alpha, length);
+  if (block_size * grid_size < vec_size) grid_size += 1;
+  scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for axpy
 //------------------------------------------------------------------------------
-__global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
-    CeedScalar * __restrict__ x, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
-  y[index] += alpha * x[index];
+__global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) {
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) y[index] += alpha * x[index];
 }
 
 //------------------------------------------------------------------------------
 // Compute y = alpha x + y on device
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
-    CeedScalar *x_array, CeedSize length) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
-  axpyValueK<<<grid_size,block_size>>>(y_array, alpha, x_array, length);
+  if (block_size * grid_size < vec_size) grid_size += 1;
+  axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for axpby
 //------------------------------------------------------------------------------
-__global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta,
-    CeedScalar * __restrict__ x, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
-  y[index] = beta * y[index];
-  y[index] += alpha * x[index];
+__global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) {
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) {
+    y[index] = beta * y[index];
+    y[index] += alpha * x[index];
+  }
 }
 
 //------------------------------------------------------------------------------
 // Compute y = alpha x + beta y on device
 //------------------------------------------------------------------------------
-extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta,
-    CeedScalar *x_array, CeedSize length) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
-  axpbyValueK<<<grid_size,block_size>>>(y_array, alpha, beta, x_array, length);
+  if (block_size * grid_size < vec_size) grid_size += 1;
+  axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
   return 0;
 }
 
 //------------------------------------------------------------------------------
 // Kernel for pointwise mult
 //------------------------------------------------------------------------------
-__global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
-    CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
-  if (index >= size)
-    return;
-  w[index] = x[index] * y[index];
+__global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) {
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) w[index] = x[index] * y[index];
 }
 
 //------------------------------------------------------------------------------
 // Compute the pointwise multiplication w = x .* y on device
 //------------------------------------------------------------------------------
-extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
-    CeedScalar *y_array, CeedSize length) {
-  const int block_size = 512;
-  const CeedSize vec_size = length;
-  int grid_size = vec_size / block_size;
+extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
 
-  if (block_size * grid_size < vec_size)
-    grid_size += 1;
-  pointwiseMultValueK<<<grid_size,block_size>>>(w_array, x_array, y_array, length);
+  if (block_size * grid_size < vec_size) grid_size += 1;
+  pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length);
   return 0;
 }
 
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index c22bce82da..885e5f0979 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -12,23 +12,17 @@
 #include <cuda_runtime.h>
 #include <stdbool.h>
 #include <stddef.h>
+#include <string.h>
 
 #include "../cuda/ceed-cuda-common.h"
 #include "../cuda/ceed-cuda-compile.h"
 #include "ceed-cuda-shared.h"
 
 //------------------------------------------------------------------------------
-// Device initalization
+// Apply tensor basis
 //------------------------------------------------------------------------------
-int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B);
-int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);
-int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);
-
-//------------------------------------------------------------------------------
-// Apply basis
-//------------------------------------------------------------------------------
-int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
-                                     CeedVector v) {
+static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
+                                                CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
   Ceed                   ceed;
   Ceed_Cuda             *ceed_Cuda;
   CeedInt                dim, num_comp;
@@ -45,102 +39,113 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
 
   // Apply basis operation
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
       CeedInt P_1d, Q_1d;
 
+      CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
 
-      CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B));
-      void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v};
+      void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
 
       if (dim == 1) {
-        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
-                                                                                                 1));  // avoid >512 total threads
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1,
+                                                      elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
         }
       } else if (dim == 2) {
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
         // elems_per_block must be at least 1
         CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
+                                                      thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      interp_args));
         }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Cuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
+                                                      thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      interp_args));
         }
       }
     } break;
     case CEED_EVAL_GRAD: {
       CeedInt P_1d, Q_1d;
 
+      CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
-      CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
+      CeedInt     thread_1d = CeedIntMax(Q_1d, P_1d);
+      CeedScalar *d_grad_1d = data->d_grad_1d;
 
       if (data->d_collo_grad_1d) {
-        CeedCallBackend(CeedInit_CudaCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G));
-      } else {
-        CeedCallBackend(CeedInit_CudaGrad(data->d_interp_1d, data->d_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G));
+        d_grad_1d = data->d_collo_grad_1d;
       }
-      void *grad_args[] = {(void *)&num_elem, &data->c_B, &data->c_G, &d_u, &d_v};
+      void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};
+
       if (dim == 1) {
-        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
-                                                                                                 1));  // avoid >512 total threads
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 1,
+                                                      elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 2) {
         const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
         // elems_per_block must be at least 1
         CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d,
+                                                      thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 3) {
         CeedInt elems_per_block = 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d,
+                                                      thread_1d, elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       }
     } break;
@@ -148,23 +153,24 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
       CeedInt Q_1d;
       CeedInt block_size = 32;
 
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
       if (dim == 1) {
         const CeedInt elems_per_block = block_size / Q_1d;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args));
       } else if (dim == 2) {
         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
       } else if (dim == 3) {
         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
       }
@@ -182,6 +188,380 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                            CeedVector v) {
+  CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                               CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Basis apply - tensor AtPoints
+//------------------------------------------------------------------------------
+static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
+                                                  CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  Ceed                   ceed;
+  Ceed_Cuda             *ceed_Cuda;
+  CeedInt                Q_1d, dim, num_comp, max_num_points = num_points[0];
+  const CeedInt          is_transpose = t_mode == CEED_TRANSPOSE;
+  const CeedScalar      *d_x, *d_u;
+  CeedScalar            *d_v;
+  CeedBasis_Cuda_shared *data;
+
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+
+  // Weight handled separately
+  if (eval_mode == CEED_EVAL_WEIGHT) {
+    CeedCallBackend(CeedVectorSetValue(v, 1.0));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &ceed_Cuda));
+
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  q_comp;
+    CeedSize len, len_required;
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
+    }
+  }
+
+  // Build kernels if needed
+  if (data->num_points != max_num_points) {
+    CeedInt P_1d;
+
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    data->num_points = max_num_points;
+
+    // -- Create interp matrix to Chebyshev coefficients
+    if (!data->d_chebyshev_interp_1d) {
+      CeedSize    interp_bytes;
+      CeedScalar *chebyshev_interp_1d;
+
+      interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+      CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+      CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
+      CeedCallCuda(ceed, cudaMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
+      CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+    }
+
+    // -- Compile kernels
+    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h>\n";
+    CeedInt    num_comp;
+
+    if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
+                                     CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
+                                     "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAddAtPoints", &data->InterpTransposeAddAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
+    CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAddAtPoints", &data->GradTransposeAddAtPoints));
+  }
+
+  // Get read/write access to u, v
+  CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
+
+  // Basis action
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      CeedInt P_1d, Q_1d;
+
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
+
+      void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+
+      if (dim == 1) {
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
+                                                      thread_1d, 1, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem,
+                                                      interp_args));
+        }
+      } else if (dim == 2) {
+        const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
+        // elems_per_block must be at least 1
+        CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
+                                                      thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      interp_args));
+        }
+      } else if (dim == 3) {
+        CeedInt elems_per_block = 1;
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
+                                                      thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      interp_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_GRAD: {
+      CeedInt P_1d, Q_1d;
+
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
+
+      void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+
+      if (dim == 1) {
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                      thread_1d, 1, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+        }
+      } else if (dim == 2) {
+        const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8};
+        // elems_per_block must be at least 1
+        CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                      thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      grad_args));
+        }
+      } else if (dim == 3) {
+        CeedInt elems_per_block = 1;
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                      thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                      grad_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_WEIGHT:
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                              CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddAtPoints_Cuda_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                                 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Cuda_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Apply non-tensor basis
+//------------------------------------------------------------------------------
+static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
+                                                   CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+  Ceed                   ceed;
+  Ceed_Cuda             *ceed_Cuda;
+  CeedInt                dim;
+  const CeedScalar      *d_u;
+  CeedScalar            *d_v;
+  CeedBasis_Cuda_shared *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &ceed_Cuda));
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+
+  // Get read/write access to u, v
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
+
+  // Apply basis operation
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      CeedInt P, Q;
+
+      CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread = CeedIntMax(Q, P);
+
+      void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
+
+      {
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1));
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
+
+        if (t_mode == CEED_TRANSPOSE) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1,
+                                                      elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, interp_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_GRAD: {
+      CeedInt P, Q;
+
+      CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread = CeedIntMax(Q, P);
+
+      void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v};
+
+      {
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1));
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
+
+        if (t_mode == CEED_TRANSPOSE) {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1,
+                                                      elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, grad_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_WEIGHT: {
+      CeedInt P, Q;
+
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread = CeedIntMax(Q, P);
+
+      void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
+
+      {
+        // avoid >512 total threads
+        CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1));
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+
+        CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight_args));
+      }
+    } break;
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                               CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddNonTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                                  CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Cuda_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -195,11 +575,16 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallCuda(ceed, cuModuleUnload(data->module));
-  CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
+  if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
   CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d));
+  CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -209,8 +594,6 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                         const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
   Ceed                   ceed;
-  char                  *basis_kernel_source;
-  const char            *basis_kernel_path;
   CeedInt                num_comp;
   const CeedInt          q_bytes      = Q_1d * sizeof(CeedScalar);
   const CeedInt          interp_bytes = q_bytes * P_1d;
@@ -220,8 +603,10 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedCalloc(1, &data));
 
   // Copy basis data to GPU
-  CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
-  CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
+  if (q_weight_1d) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice));
+  }
   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes));
   CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice));
   CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes));
@@ -242,27 +627,103 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   }
 
   // Compile basis kernels
+  bool       is_collocated         = false;
+  const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete -----\n");
-  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
+  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
                                    CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                                    "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad));
+  CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocated" : "Interp", &data->Interp));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTranspose" : "InterpTranspose", &data->InterpTranspose));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd",
+                                     &data->InterpTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocated" : "Grad", &data->Grad));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTranspose" : "GradTranspose", &data->GradTranspose));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, is_collocated ? "GradCollocatedTransposeAdd" : "GradTransposeAdd", &data->GradTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
+
+  CeedCallBackend(CeedBasisSetData(basis, data));
+
+  // Register backend functions
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Create non-tensor basis
+//------------------------------------------------------------------------------
+int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                  const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
+  Ceed                   ceed;
+  CeedInt                num_comp, q_comp_interp, q_comp_grad;
+  const CeedInt          q_bytes = num_qpts * sizeof(CeedScalar);
+  CeedBasis_Cuda_shared *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+
+  // Check shared memory size
+  {
+    Ceed_Cuda *cuda_data;
+
+    CeedCallBackend(CeedGetData(ceed, &cuda_data));
+    if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) >
+        cuda_data->device_prop.sharedMemPerBlock) {
+      CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis));
+      CeedCallBackend(CeedDestroy(&ceed));
+      return CEED_ERROR_SUCCESS;
+    }
+  }
+
+  CeedCallBackend(CeedCalloc(1, &data));
+
+  // Copy basis data to GPU
+  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
+  if (q_weight) {
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight, q_bytes, cudaMemcpyHostToDevice));
+  }
+  if (interp) {
+    const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
+
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp, interp_bytes, cudaMemcpyHostToDevice));
+  }
+  if (grad) {
+    const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;
+
+    CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, grad_bytes));
+    CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad, grad_bytes, cudaMemcpyHostToDevice));
+  }
+
+  // Compile basis kernels
+  const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor.h>\n";
+
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_T_1D",
+                                   CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTranspose", &data->GradTranspose));
+  CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
   // Register backend functions
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/cuda-shared/ceed-cuda-shared.c b/backends/cuda-shared/ceed-cuda-shared.c
index ef704f7193..1224032995 100644
--- a/backends/cuda-shared/ceed-cuda-shared.c
+++ b/backends/cuda-shared/ceed-cuda-shared.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -24,6 +24,7 @@ static int CeedInit_Cuda_shared(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root));
   CeedCheck(!strcmp(resource_root, "/gpu/cuda/shared"), ceed, CEED_ERROR_BACKEND, "Cuda backend cannot use resource: %s", resource);
+  CeedCallBackend(CeedFree(&resource_root));
   CeedCallBackend(CeedSetDeterministic(ceed, true));
 
   CeedCallBackend(CeedCalloc(1, &data));
@@ -32,8 +33,10 @@ static int CeedInit_Cuda_shared(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Cuda_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Cuda_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index ffc70dd6f5..7d67327789 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -14,16 +14,31 @@ typedef struct {
   CUmodule    module;
   CUfunction  Interp;
   CUfunction  InterpTranspose;
+  CUfunction  InterpTransposeAdd;
   CUfunction  Grad;
   CUfunction  GradTranspose;
+  CUfunction  GradTransposeAdd;
   CUfunction  Weight;
+  CUmodule    moduleAtPoints;
+  CeedInt     num_points;
+  CUfunction  InterpAtPoints;
+  CUfunction  InterpTransposeAtPoints;
+  CUfunction  InterpTransposeAddAtPoints;
+  CUfunction  GradAtPoints;
+  CUfunction  GradTransposeAtPoints;
+  CUfunction  GradTransposeAddAtPoints;
   CeedScalar *d_interp_1d;
   CeedScalar *d_grad_1d;
   CeedScalar *d_collo_grad_1d;
   CeedScalar *d_q_weight_1d;
-  CeedScalar *c_B;
-  CeedScalar *c_G;
+  CeedScalar *d_chebyshev_interp_1d;
+  CeedInt     num_elem_at_points;
+  CeedInt    *h_points_per_elem;
+  CeedInt    *d_points_per_elem;
 } CeedBasis_Cuda_shared;
 
 CEED_INTERN int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                                     const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis);
+
+CEED_INTERN int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                              const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
diff --git a/backends/cuda-shared/kernels/cuda-shared-basis.cu b/backends/cuda-shared/kernels/cuda-shared-basis.cu
deleted file mode 100644
index 3374cd8bb8..0000000000
--- a/backends/cuda-shared/kernels/cuda-shared-basis.cu
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include <ceed.h>
-#include <cuda.h>
-
-const int sizeMax = 16;
-__constant__ CeedScalar c_B[sizeMax*sizeMax];
-__constant__ CeedScalar c_G[sizeMax*sizeMax];
-
-//------------------------------------------------------------------------------
-// Interp device initalization
-//------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d,
-                                  CeedScalar **c_B_ptr) {
-  const int bytes = P_1d*Q_1d*sizeof(CeedScalar);
-
-  cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_B_ptr, c_B);
-  return CEED_ERROR_SUCCESS;
-}
-
-//------------------------------------------------------------------------------
-// Grad device initalization
-//------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G,
-    CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
-  const int bytes = P_1d*Q_1d*sizeof(CeedScalar);
-
-  cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_B_ptr, c_B);
-  cudaMemcpyToSymbol(c_G, d_G, bytes, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_G_ptr, c_G);
-  return CEED_ERROR_SUCCESS;
-}
-
-//------------------------------------------------------------------------------
-// Collocated grad device initalization
-//------------------------------------------------------------------------------
-extern "C" int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G,
-    CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
-  const int bytes_interp = P_1d*Q_1d*sizeof(CeedScalar);
-  const int bytes_grad = Q_1d*Q_1d*sizeof(CeedScalar);
-
-  cudaMemcpyToSymbol(c_B, d_B, bytes_interp, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_B_ptr, c_B);
-  cudaMemcpyToSymbol(c_G, d_G, bytes_grad, 0, cudaMemcpyDeviceToDevice);
-  cudaGetSymbolAddress((void **)c_G_ptr, c_G);
-  return CEED_ERROR_SUCCESS;
-}
-
-//------------------------------------------------------------------------------
diff --git a/backends/cuda/ceed-cuda-common.c b/backends/cuda/ceed-cuda-common.c
index cae17d11d5..9538a2ee4d 100644
--- a/backends/cuda/ceed-cuda-common.c
+++ b/backends/cuda/ceed-cuda-common.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -53,10 +53,15 @@ static inline int CeedSetDeviceGenericArray_Cuda(Ceed ceed, const void *source_a
                                                  void *target_array_owned, void *target_array_borrowed, void *target_array) {
   switch (copy_mode) {
     case CEED_COPY_VALUES:
-      if (!*(void **)target_array_owned) CeedCallCuda(ceed, cudaMalloc(target_array_owned, size_unit * num_values));
-      if (source_array) CeedCallCuda(ceed, cudaMemcpy(*(void **)target_array_owned, source_array, size_unit * num_values, cudaMemcpyDeviceToDevice));
-      *(void **)target_array_borrowed = NULL;
-      *(void **)target_array          = *(void **)target_array_owned;
+      if (!*(void **)target_array) {
+        if (*(void **)target_array_borrowed) {
+          *(void **)target_array = *(void **)target_array_borrowed;
+        } else {
+          if (!*(void **)target_array_owned) CeedCallCuda(ceed, cudaMalloc(target_array_owned, size_unit * num_values));
+          *(void **)target_array = *(void **)target_array_owned;
+        }
+      }
+      if (source_array) CeedCallCuda(ceed, cudaMemcpy(*(void **)target_array, source_array, size_unit * num_values, cudaMemcpyDeviceToDevice));
       break;
     case CEED_OWN_POINTER:
       CeedCallCuda(ceed, cudaFree(*(void **)target_array_owned));
diff --git a/backends/cuda/ceed-cuda-common.h b/backends/cuda/ceed-cuda-common.h
index 1fc8362717..489374a29a 100644
--- a/backends/cuda/ceed-cuda-common.h
+++ b/backends/cuda/ceed-cuda-common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -66,6 +66,8 @@ static const char *cublasGetErrorName(cublasStatus_t error) {
 
 typedef struct {
   int                   device_id;
+  bool                  use_llvm_version;
+  int                   llvm_version;
   cublasHandle_t        cublas_handle;
   struct cudaDeviceProp device_prop;
 } Ceed_Cuda;
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
index b8186ba2d1..d1593dd800 100644
--- a/backends/cuda/ceed-cuda-compile.cpp
+++ b/backends/cuda/ceed-cuda-compile.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -11,11 +11,19 @@
 #include <ceed/backend.h>
 #include <ceed/jit-tools.h>
 #include <cuda_runtime.h>
+#include <dirent.h>
 #include <nvrtc.h>
 #include <stdarg.h>
+#include <stdio.h>
 #include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
 #include <sstream>
+#include <string>
 
 #include "ceed-cuda-common.h"
 
@@ -31,15 +39,42 @@
     CeedChk_Nvrtc(ceed, ierr_q_); \
   } while (0)
 
+#define CeedCallSystem(ceed, command, message) CeedCallBackend(CeedCallSystem_Core(ceed, command, message))
+
+//------------------------------------------------------------------------------
+// Call system command and capture stdout + stderr
+//------------------------------------------------------------------------------
+static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *message) {
+  CeedDebug(ceed, "Running command:\n$ %s", command);
+  FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
+
+  CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s", message, command);
+
+  char        line[CEED_MAX_RESOURCE_LEN] = "";
+  std::string output                      = "";
+
+  while (fgets(line, sizeof(line), output_stream) != nullptr) {
+    output += line;
+  }
+  CeedDebug(ceed, "output:\n%s\n", output.c_str());
+  CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s\nerror:\n%s", message, command, output.c_str());
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Compile CUDA kernel
 //------------------------------------------------------------------------------
-int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
+using std::ifstream;
+using std::ofstream;
+using std::ostringstream;
+
+static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module,
+                                const CeedInt num_defines, va_list args) {
   size_t                ptx_size;
   char                 *ptx;
-  const char           *jit_defs_path, *jit_defs_source;
-  const int             num_opts = 3;
-  const char           *opts[num_opts];
+  const int             num_opts            = 4;
+  CeedInt               num_jit_source_dirs = 0, num_jit_defines = 0;
+  const char          **opts;
   nvrtcProgram          prog;
   struct cudaDeviceProp prop;
   Ceed_Cuda            *ceed_data;
@@ -47,11 +82,17 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
   cudaFree(0);  // Make sure a Context exists for nvrtc
 
   std::ostringstream code;
+  bool               using_clang;
+
+  CeedCallBackend(CeedGetIsClang(ceed, &using_clang));
+
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS,
+               using_clang ? "Compiling CUDA with Clang backend (with Rust QFunction support)"
+                           : "Compiling CUDA with NVRTC backend (without Rust QFunction support).\nTo use the Clang backend, set the environment "
+                             "variable GPU_CLANG=1");
 
   // Get kernel specific options, such as kernel constants
   if (num_defines > 0) {
-    va_list args;
-    va_start(args, num_defines);
     char *name;
     int   val;
 
@@ -60,59 +101,329 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
       val  = va_arg(args, int);
       code << "#define " << name << " " << val << "\n";
     }
-    va_end(args);
   }
 
   // Standard libCEED definitions for CUDA backends
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-jit.h", &jit_defs_path));
-  {
-    char *source;
-
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, jit_defs_path, &source));
-    jit_defs_source = source;
-  }
-  code << jit_defs_source;
-  code << "\n\n";
-  CeedCallBackend(CeedFree(&jit_defs_path));
-  CeedCallBackend(CeedFree(&jit_defs_source));
+  code << "#include <ceed/jit-source/cuda/cuda-jit.h>\n\n";
 
   // Non-macro options
+  CeedCallBackend(CeedCalloc(num_opts, &opts));
   opts[0] = "-default-device";
   CeedCallBackend(CeedGetData(ceed, &ceed_data));
   CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id));
-  std::string arch_arg = "-arch=compute_" + std::to_string(prop.major) + std::to_string(prop.minor);
-  opts[1]              = arch_arg.c_str();
-  opts[2]              = "-Dint32_t=int";
+  std::string arch_arg =
+#if CUDA_VERSION >= 11010
+      // NVRTC used to support only virtual architectures through the option
+      // -arch, since it was only emitting PTX. It will now support actual
+      // architectures as well to emit SASS.
+      // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation
+      "-arch=sm_"
+#else
+      "-arch=compute_"
+#endif
+      + std::to_string(prop.major) + std::to_string(prop.minor);
+  opts[1] = arch_arg.c_str();
+  opts[2] = "-Dint32_t=int";
+  opts[3] = "-DCEED_RUNNING_JIT_PASS=1";
+  // Additional include dirs
+  {
+    const char **jit_source_dirs;
+
+    CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
+    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts));
+    for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
+      std::ostringstream include_dir_arg;
+
+      include_dir_arg << "-I" << jit_source_dirs[i];
+      CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i]));
+    }
+    CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
+  }
+  // User defines
+  {
+    const char **jit_defines;
+
+    CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
+    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts));
+    for (CeedInt i = 0; i < num_jit_defines; i++) {
+      std::ostringstream define_arg;
+
+      define_arg << "-D" << jit_defines[i];
+      CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i]));
+    }
+    CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines));
+  }
 
   // Add string source argument provided in call
   code << source;
 
-  // Create Program
-  CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
-
   // Compile kernel
-  nvrtcResult result = nvrtcCompileProgram(prog, num_opts, opts);
-
-  if (result != NVRTC_SUCCESS) {
-    char  *log;
-    size_t log_size;
-
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n");
-    CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n");
-    CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
-    CeedCallBackend(CeedMalloc(log_size, &log));
-    CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
-    return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
+  CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n");
+
+  if (!using_clang) {
+    CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
+
+    if (CeedDebugFlag(ceed)) {
+      // LCOV_EXCL_START
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n");
+      for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
+        CeedDebug(ceed, "Option %d: %s", i, opts[i]);
+      }
+      CeedDebug(ceed, "");
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n");
+      // LCOV_EXCL_STOP
+    }
+
+    nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
+
+    for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
+      CeedCallBackend(CeedFree(&opts[num_opts + i]));
+    }
+    for (CeedInt i = 0; i < num_jit_defines; i++) {
+      CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
+    }
+    CeedCallBackend(CeedFree(&opts));
+    *is_compile_good = result == NVRTC_SUCCESS;
+    if (!*is_compile_good) {
+      char  *log;
+      size_t log_size;
+
+      CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
+      CeedCallBackend(CeedMalloc(log_size, &log));
+      CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
+      if (throw_error) {
+        return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
+      } else {
+        // LCOV_EXCL_START
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
+        CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+        CeedCallBackend(CeedFree(&log));
+        CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
+        return CEED_ERROR_SUCCESS;
+        // LCOV_EXCL_STOP
+      }
+    }
+
+#if CUDA_VERSION >= 11010
+    CeedCallNvrtc(ceed, nvrtcGetCUBINSize(prog, &ptx_size));
+    CeedCallBackend(CeedMalloc(ptx_size, &ptx));
+    CeedCallNvrtc(ceed, nvrtcGetCUBIN(prog, ptx));
+#else
+    CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size));
+    CeedCallBackend(CeedMalloc(ptx_size, &ptx));
+    CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx));
+#endif
+    CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
+
+    CeedCallCuda(ceed, cuModuleLoadData(module, ptx));
+    CeedCallBackend(CeedFree(&ptx));
+    return CEED_ERROR_SUCCESS;
+  } else {
+    srand(time(NULL));
+    const int build_id = rand();
+
+    // Create temp dir if needed
+    {
+      DIR *dir = opendir("temp");
+
+      if (dir) {
+        closedir(dir);
+      } else {
+        // In parallel multiple processes may attempt
+        // Only one process needs to succeed
+        mkdir("temp", 0777);
+        chmod("temp", 0777);
+      }
+    }
+    // Write code to temp file
+    {
+      std::string filename = std::string("temp/kernel_") + std::to_string(build_id) + std::string("_0_source.cu");
+      FILE       *file     = fopen(filename.c_str(), "w");
+
+      CeedCheck(file, ceed, CEED_ERROR_BACKEND, "Failed to create file. Write access is required for cuda-clang");
+      fputs(code.str().c_str(), file);
+      fclose(file);
+    }
+
+    // Get rust crate directories
+    const char **rust_source_dirs     = nullptr;
+    int          num_rust_source_dirs = 0;
+
+    CeedCallBackend(CeedGetRustSourceRoots(ceed, &num_rust_source_dirs, &rust_source_dirs));
+
+    std::string rust_dirs[10];
+
+    if (num_rust_source_dirs > 0) {
+      CeedDebug(ceed, "There are %d source dirs, including %s\n", num_rust_source_dirs, rust_source_dirs[0]);
+    }
+
+    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
+      rust_dirs[i] = std::string(rust_source_dirs[i]);
+    }
+
+    CeedCallBackend(CeedRestoreRustSourceRoots(ceed, &rust_source_dirs));
+
+    char *rust_toolchain = std::getenv("RUST_TOOLCHAIN");
+
+    if (rust_toolchain == nullptr) {
+      rust_toolchain = (char *)"nightly";
+      setenv("RUST_TOOLCHAIN", "nightly", 0);
+    }
+
+    // Compile Rust crate(s) needed
+    std::string command;
+
+    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
+      command = "cargo +" + std::string(rust_toolchain) + " build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] +
+                "/.cargo/config.toml --manifest-path " + rust_dirs[i] + "/Cargo.toml";
+      CeedCallSystem(ceed, command.c_str(), "build Rust crate");
+    }
+
+    // Get Clang version
+    bool use_llvm_version = ceed_data->use_llvm_version;
+    int  llvm_version     = ceed_data->llvm_version;
+
+    if (llvm_version == 0) {
+      command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) --version";
+      CeedDebug(ceed, "Attempting to detect Rust LLVM version.\ncommand:\n$ %s", command.c_str());
+      FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
+
+      CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version");
+
+      char        line[CEED_MAX_RESOURCE_LEN] = "";
+      std::string output                      = "";
+
+      while (fgets(line, sizeof(line), output_stream) != nullptr) {
+        output += line;
+      }
+      CeedDebug(ceed, "output:\n%s", output.c_str());
+      CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version\ncommand:\n$ %s\nerror:\n%s",
+                command.c_str(), output.c_str());
+
+      const char *version_substring = strstr(output.c_str(), "LLVM version ");
+
+      version_substring += 13;
+
+      char *next_dot = strchr((char *)version_substring, '.');
+
+      if (next_dot) {
+        next_dot[0]             = '\0';
+        ceed_data->llvm_version = llvm_version = std::stoi(version_substring);
+        CeedDebug(ceed, "Rust LLVM version number: %d\n", llvm_version);
+
+        command                     = std::string("clang++-") + std::to_string(llvm_version);
+        output_stream               = popen((command + std::string(" 2>&1")).c_str(), "r");
+        ceed_data->use_llvm_version = use_llvm_version = pclose(output_stream) == 0;
+      } else {
+        ceed_data->llvm_version     = -1;
+        ceed_data->use_llvm_version = use_llvm_version = false;
+      }
+    }
+
+    // Compile wrapper kernel
+    command = "clang++" + (use_llvm_version ? (std::string("-") + std::to_string(llvm_version)) : "") + " -flto=thin --cuda-gpu-arch=sm_" +
+              std::to_string(prop.major) + std::to_string(prop.minor) + " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string(build_id) +
+              "_0_source.cu -o temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll ";
+    command += opts[4];
+    CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
+    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll").c_str(), "update JiT file permissions");
+
+    // Find Rust's llvm-link tool and run it
+    command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp/kernel_" +
+              std::to_string(build_id) +
+              "_1_wrapped.ll --ignore-non-bitcode --internalize --only-needed -S -o "
+              "temp/kernel_" +
+              std::to_string(build_id) + "_2_linked.ll ";
+
+    // Searches for .a files in Rust directory
+    // Note: Rust crate names may not match the folder they are in
+    // TODO: If libCEED switches to c++17, use std::filesystem here
+    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
+      std::string dir = rust_dirs[i] + "/target/nvptx64-nvidia-cuda/release";
+      DIR        *dp  = opendir(dir.c_str());
+
+      CeedCheck(dp != nullptr, ceed, CEED_ERROR_BACKEND, "Could not open directory: %s", dir.c_str());
+      struct dirent *entry;
+
+      // Find files ending in .a
+      while ((entry = readdir(dp)) != nullptr) {
+        std::string filename(entry->d_name);
+
+        if (filename.size() >= 2 && filename.substr(filename.size() - 2) == ".a") {
+          command += dir + "/" + filename + " ";
+        }
+      }
+      closedir(dp);
+    }
+
+    // Link, optimize, and compile final CUDA kernel
+    CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
+    CeedCallSystem(ceed,
+                   ("$(find $(rustup run " + std::string(rust_toolchain) +
+                    " rustc --print sysroot) -name opt) --passes internalize,inline temp/kernel_" + std::to_string(build_id) +
+                    "_2_linked.ll -o temp/kernel_" + std::to_string(build_id) + "_3_opt.bc")
+                       .c_str(),
+                   "optimize linked C and Rust source");
+    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_2_linked.ll").c_str(), "update JiT file permissions");
+    CeedCallSystem(ceed,
+                   ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" +
+                    std::to_string(prop.major) + std::to_string(prop.minor) + " temp/kernel_" + std::to_string(build_id) +
+                    "_3_opt.bc -o temp/kernel_" + std::to_string(build_id) + "_4_final.ptx")
+                       .c_str(),
+                   "compile final CUDA kernel");
+    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_4_final.ptx").c_str(), "update JiT file permissions");
+
+    // Load module from final PTX
+    ifstream      ptxfile("temp/kernel_" + std::to_string(build_id) + "_4_final.ptx");
+    ostringstream sstr;
+
+    sstr << ptxfile.rdbuf();
+
+    auto ptx_data = sstr.str();
+    ptx_size      = ptx_data.length();
+
+    int result = cuModuleLoadData(module, ptx_data.c_str());
+
+    *is_compile_good = result == 0;
+    if (!*is_compile_good) {
+      if (throw_error) {
+        return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to load module data");
+      } else {
+        // LCOV_EXCL_START
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
+        CeedDebug(ceed, "Error: Failed to load module data");
+        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+        return CEED_ERROR_SUCCESS;
+        // LCOV_EXCL_STOP
+      }
+    }
   }
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
+  bool    is_compile_good = true;
+  va_list args;
+
+  va_start(args, num_defines);
+  const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args);
+
+  va_end(args);
+  CeedCallBackend(ierr);
+  return CEED_ERROR_SUCCESS;
+}
 
-  CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size));
-  CeedCallBackend(CeedMalloc(ptx_size, &ptx));
-  CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx));
-  CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
+int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) {
+  va_list args;
 
-  CeedCallCuda(ceed, cuModuleLoadData(module, ptx));
-  CeedCallBackend(CeedFree(&ptx));
+  va_start(args, num_defines);
+  const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args);
+
+  va_end(args);
+  CeedCallBackend(ierr);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -141,7 +452,7 @@ int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points, void
 // Run CUDA kernel
 //------------------------------------------------------------------------------
 int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) {
-  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, grid_size, block_size, 1, 1, 0, args));
+  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size, 1, 1, 0, args));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -150,19 +461,20 @@ int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const
 //------------------------------------------------------------------------------
 int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z,
                           void **args) {
-  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, 0, args));
+  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size_x, block_size_y, block_size_z, 0, args));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Run CUDA kernel for spatial dimension with shared memory
 //------------------------------------------------------------------------------
-int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
-                                const int block_size_z, const int shared_mem_size, void **args) {
+static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x,
+                                           const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
+                                           bool *is_good_run, void **args) {
 #if CUDA_VERSION >= 9000
   cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
 #endif
-  CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL);
+  CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
 
   if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
     int max_threads_per_block, shared_size_bytes, num_regs;
@@ -170,11 +482,37 @@ int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_siz
     cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
     cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
     cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
-    return CeedError(ceed, CEED_ERROR_BACKEND,
-                     "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
-                     max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
+    if (throw_error) {
+      return CeedError(ceed, CEED_ERROR_BACKEND,
+                       "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
+                       max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
+    } else {
+      // LCOV_EXCL_START
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
+      CeedDebug(ceed, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d\n",
+                max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
+      // LCOV_EXCL_STOP
+    }
+    *is_good_run = false;
   } else CeedChk_Cu(ceed, result);
   return CEED_ERROR_SUCCESS;
 }
 
+int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
+                                const int block_size_z, const int shared_mem_size, void **args) {
+  bool is_good_run = true;
+
+  CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
+                                                  &is_good_run, args));
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
+                                   const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
+  CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
+                                                  is_good_run, args));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
diff --git a/backends/cuda/ceed-cuda-compile.h b/backends/cuda/ceed-cuda-compile.h
index 846de28c9d..151f0e0a24 100644
--- a/backends/cuda/ceed-cuda-compile.h
+++ b/backends/cuda/ceed-cuda-compile.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -13,6 +13,7 @@
 static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; }
 
 CEED_INTERN int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...);
+CEED_INTERN int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...);
 
 CEED_INTERN int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel);
 
@@ -22,5 +23,7 @@ CEED_INTERN int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t
 
 CEED_INTERN int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, void **args);
 
-CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
-                                            int shared_mem_size, void **args);
+CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, int grid_size, int block_size_x, int block_size_y,
+                                            int block_size_z, int shared_mem_size, void **args);
+CEED_INTERN int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, int grid_size, int block_size_x, int block_size_y,
+                                               int block_size_z, int shared_mem_size, bool *is_good_run, void **args);
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
index c4878a5fed..d2261d6f1b 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -9,8 +9,10 @@
 
 #include <ceed.h>
 #include <ceed/backend.h>
+#include <ceed/gen-tools.h>
 #include <ceed/jit-tools.h>
 
+#include <cstring>
 #include <iostream>
 #include <sstream>
 #include <string>
@@ -21,370 +23,494 @@
 #include "../hip/ceed-hip-compile.h"
 #include "ceed-hip-gen.h"
 
+struct FieldReuse_Hip {
+  CeedInt      index;
+  bool         is_input;
+  CeedEvalMode eval_mode;
+};
+
 //------------------------------------------------------------------------------
 // Calculate the block size used for launching the operator kernel
 //------------------------------------------------------------------------------
 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
-  const CeedInt thread1d = CeedIntMax(Q_1d, P_1d);
+  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
   if (dim == 1) {
-    CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64;
+    CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
 
     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
-    block_sizes[0]  = thread1d;
+    block_sizes[0]  = thread_1d;
     block_sizes[1]  = 1;
     block_sizes[2]  = elems_per_block;
   } else if (dim == 2) {
-    const CeedInt elems_per_block = thread1d < 4 ? 16 : 2;
+    const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
 
-    block_sizes[0] = thread1d;
-    block_sizes[1] = thread1d;
+    block_sizes[0] = thread_1d;
+    block_sizes[1] = thread_1d;
     block_sizes[2] = elems_per_block;
   } else if (dim == 3) {
-    const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1);
+    const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
 
-    block_sizes[0] = thread1d;
-    block_sizes[1] = thread1d;
+    block_sizes[0] = thread_1d;
+    block_sizes[1] = thread_1d;
     block_sizes[2] = elems_per_block;
   }
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
-// Build single operator kernel
+// Determine type of operator
 //------------------------------------------------------------------------------
-extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
-  using std::ostringstream;
-  using std::string;
-
-  Ceed                     ceed;
-  bool                     is_setup_done, is_identity_qf;
-  CeedSize                 l_size;
-  CeedInt                  Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1;
-  CeedEvalMode             eval_mode;
-  CeedElemRestriction      elem_rstr;
-  CeedElemRestriction_Hip *rstr_data;
-  CeedBasis                basis;
-  CeedBasis_Hip_shared    *basis_data;
-  CeedQFunctionField      *qf_input_fields, *qf_output_fields;
-  CeedQFunction_Hip_gen   *qf_data;
-  CeedQFunction            qf;
-  CeedOperatorField       *op_input_fields, *op_output_fields;
-  CeedOperator_Hip_gen    *data;
-
-  CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
-  if (is_setup_done) return CEED_ERROR_SUCCESS;
-
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedOperatorGetData(op, &data));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-  Q_1d = Q;
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
+                                               CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
+                                               CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
+                                               CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
+  // Check if all are tensor
+  *is_all_tensor = true;
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedBasis basis;
 
-  // TODO: put in a function?
-  // Check for restriction only identity operator
-  CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
-  if (is_identity_qf) {
-    CeedEvalMode eval_mode_in, eval_mode_out;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+    if (basis != CEED_BASIS_NONE) {
+      bool is_field_tensor;
 
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
-    CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
-              "Backend does not implement restriction only identity operators");
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      *is_all_tensor = *is_all_tensor && is_field_tensor;
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedBasis basis;
 
-  ostringstream code;
-
-  // Load basis source files
-  // TODO: generalize to accept different device functions?
-  {
-    char       *tensor_basis_kernel_source;
-    const char *tensor_basis_kernel_path;
-
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source));
-    code << tensor_basis_kernel_source;
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
-    CeedCallBackend(CeedFree(&tensor_basis_kernel_source));
-  }
-  {
-    char       *hip_gen_template_source;
-    const char *hip_gen_template_path;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+    if (basis != CEED_BASIS_NONE) {
+      bool is_field_tensor;
 
-    CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-gen-templates.h", &hip_gen_template_path));
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Hip-Gen Template Source -----\n");
-    CeedCallBackend(CeedLoadSourceToBuffer(ceed, hip_gen_template_path, &hip_gen_template_source));
-    code << hip_gen_template_source;
-    CeedCallBackend(CeedFree(&hip_gen_template_path));
-    CeedCallBackend(CeedFree(&hip_gen_template_source));
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      *is_all_tensor = *is_all_tensor && is_field_tensor;
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
 
-  // Get QFunction source and name
-  string qfunction_source(qf_data->qfunction_source);
-  string qfunction_name(qf_data->qfunction_name);
-  string operator_name;
-  operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
+  // Find max_P, max_P_1d, Q, and Q_1d
+  bool is_all_3d = true;
 
-  // Find dim, P_1d, Q_1d
-  data->max_P_1d = 0;
+  *max_P    = 0;
+  *max_P_1d = 0;
+  *Q        = 0;
+  *Q_1d     = 0;
   for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedBasis basis;
+
     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
-      bool is_tensor;
+      bool    is_field_tensor;
+      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
 
-      CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      // Check if 3D
+      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
+      is_all_3d = is_all_3d && (field_dim == 3);
+      *max_dim  = CeedIntMax(*max_dim, field_dim);
 
-      // Collect dim, P_1d, and Q_1d
-      CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-      CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-      if (P_1d > data->max_P_1d) data->max_P_1d = P_1d;
+      // Collect P, P_1d, Q, and Q_1d
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
+      *max_P = CeedIntMax(*max_P, field_P);
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      }
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
+      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q = field_Q;
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+        *Q_1d = field_Q_1d;
+      }
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
-  // Check output bases for Q_1d, dim as well
-  //   The only input basis might be CEED_BASIS_NONE
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+    CeedBasis basis;
 
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
-      bool is_tensor;
+      bool    is_field_tensor;
+      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
 
-      CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Check if 3D
+      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
+      is_all_3d = is_all_3d && (field_dim == 3);
+      *max_dim  = CeedIntMax(*max_dim, field_dim);
 
-      // Collect Q_1d
-      CeedCallBackend(CeedBasisGetDimension(basis, &dim));
-      CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
-      CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
-      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      // Collect P, P_1d, Q, and Q_1d
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
+      *max_P = CeedIntMax(*max_P, field_P);
+      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
+        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
+      }
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
+      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+      *Q = field_Q;
+      if (is_field_tensor) {
+        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
+        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
+        *Q_1d = field_Q_1d;
+      }
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
-  data->dim  = dim;
-  data->Q_1d = Q_1d;
 
   // Only use 3D collocated gradient parallelization strategy when gradient is computed
-  // TODO: put in a function?
-  bool use_collograd_parallelization = false;
-
-  if (dim == 3) {
+  *use_3d_slices = false;
+  if (is_all_3d && *is_all_tensor) {
     bool was_grad_found = false;
 
     for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedEvalMode eval_mode;
+
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_GRAD) {
+        CeedBasis_Hip_shared *basis_data;
+        CeedBasis             basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
-        was_grad_found                = true;
+        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
+        was_grad_found = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedEvalMode eval_mode;
+
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_GRAD) {
+        CeedBasis_Hip_shared *basis_data;
+        CeedBasis             basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        use_collograd_parallelization = basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
-        was_grad_found                = true;
+        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
+        was_grad_found = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
   }
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Define CEED_Q_VLA
-  code << "\n#undef CEED_Q_VLA\n";
-  if (dim != 3 || use_collograd_parallelization) {
-    code << "#define CEED_Q_VLA 1\n\n";
-  } else {
-    code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
-  }
+//------------------------------------------------------------------------------
+// Setup fields
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
+                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
+                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
+                                                    bool use_3d_slices, bool skip_active_load) {
+  bool      is_tensor = true, is_active = true;
+  CeedBasis basis;
 
-  code << qfunction_source;
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+  {
+    CeedVector vec;
 
-  // Setup
-  code << "\n// -----------------------------------------------------------------------------\n";
-  code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
-  code << "__global__ void " << operator_name
-       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W) {\n";
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
-      code << "  const CeedScalar* d_u_" << i << " = fields.inputs[" << i << "];\n";
-    }
+    CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  CeedScalar* d_v_" << i << " = fields.outputs[" << i << "];\n";
-  }
+  const char           *field_name;
+  std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
+  std::string           option_name = (is_input ? "inputs" : "outputs");
+  CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
+  CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
+  CeedElemRestriction   elem_rstr;
+  CeedBasis_Hip_shared *basis_data;
 
-  code << "  const CeedInt dim = " << dim << ";\n";
-  code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
+  // Field reuse info
+  bool use_previous_field = field_reuse.index != -1;
 
-  code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
-  // TODO put in a function? InitSharedData_Hip?
-  code << "  SharedData_Hip data;\n";
-  code << "  data.t_id_x = threadIdx.x;\n";
-  code << "  data.t_id_y = threadIdx.y;\n";
-  code << "  data.t_id_z = threadIdx.z;\n";
-  code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
-  code << "  data.slice = slice+data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
+  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
+  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
 
-  code << "\n  // -- Input field constants and basis data --\n";
-  // TODO: Put in a function?
-  // Initialize constants, and matrices B and G
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "  // ---- Input field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+  }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+  if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
+    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
+  }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
-    // Set field constants
-    if (eval_mode != CEED_EVAL_WEIGHT) {
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-      if (basis != CEED_BASIS_NONE) {
-        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-        code << "  const CeedInt P_in_" << i << " = " << P_1d << ";\n";
+  // Set field constants
+  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
+  if (is_tensor && !is_all_tensor) {
+    CeedInt P = 0;
+
+    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
+  }
+  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
+  if (eval_mode != CEED_EVAL_WEIGHT) {
+    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
+  }
+
+  // Load basis data
+  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  switch (eval_mode) {
+    case CEED_EVAL_NONE:
+      break;
+    case CEED_EVAL_INTERP:
+      if (is_at_points) {
+        // AtPoints
+        if (!basis_data->d_chebyshev_interp_1d) {
+          CeedSize    interp_bytes;
+          CeedScalar *chebyshev_interp_1d;
+
+          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
+          CeedCallHip(CeedBasisReturnCeed(basis),
+                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
+          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+        }
+        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
+        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
       } else {
-        code << "  const CeedInt P_in_" << i << " = " << Q_1d << ";\n";
+        // Standard quadrature
+        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+        else data->B.outputs[i] = basis_data->d_interp_1d;
       }
-      code << "  const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n";
-    }
+      if (use_previous_field && !skip_active_load) {
+        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-    // Load basis data
-    code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        break;
-      case CEED_EVAL_INTERP:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.inputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_in_" << i << ",Q_1d>(data, B.inputs[" << i << "], s_B_in_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.inputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_in_" << i << ",Q_1d>(data, B.inputs[" << i << "], s_B_in_" << i << ");\n";
-        if (use_collograd_parallelization) {
-          data->G.inputs[i] = basis_data->d_collo_grad_1d;
-          code << "  __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n";
-          code << "  loadMatrix<Q_1d,Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i << ");\n";
+        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
+      } else {
+        bool is_collocated = false;
+
+        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+        if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
+          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
         } else {
-          bool has_collo_grad = basis_data->d_collo_grad_1d;
-          data->G.inputs[i]   = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
-          code << "  __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
-          code << "  loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_in_" + std::to_string(i))) << ",Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i
-               << ");\n";
+          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
         }
-        break;
-      case CEED_EVAL_WEIGHT:
-        break;  // No action
-      case CEED_EVAL_DIV:
-        break;  // TODO: Not implemented
-      case CEED_EVAL_CURL:
-        break;  // TODO: Not implemented
-    }
-  }
-
-  code << "\n  // -- Output field constants and basis data --\n";
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "  // ---- Output field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      }
+      break;
+    case CEED_EVAL_GRAD:
+      if (is_at_points) {
+        // AtPoints
+        if (!basis_data->d_chebyshev_interp_1d) {
+          CeedSize    interp_bytes;
+          CeedScalar *chebyshev_interp_1d;
 
-    // Set field constants
-    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-    if (basis != CEED_BASIS_NONE) {
-      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-      code << "  const CeedInt P_out_" << i << " = " << P_1d << ";\n";
-    } else {
-      code << "  const CeedInt P_out_" << i << " = " << Q_1d << ";\n";
-    }
-    code << "  const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n";
+          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
+          CeedCallHip(CeedBasisReturnCeed(basis),
+                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
+          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+        }
+        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
+        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
+      } else {
+        // Standard quadrature
+        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+        else data->B.outputs[i] = basis_data->d_interp_1d;
+      }
+      if (is_tensor) {
+        if (use_previous_field && !skip_active_load) {
+          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
 
-    // Load basis data
-    code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        break;  // No action
-      case CEED_EVAL_INTERP:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.outputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_out_" << i << ",Q_1d>(data, B.outputs[" << i << "], s_B_out_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->B.outputs[i] = basis_data->d_interp_1d;
-        code << "  __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
-        code << "  loadMatrix<P_out_" << i << ",Q_1d>(data, B.outputs[" << i << "], s_B_out_" << i << ");\n";
-        if (use_collograd_parallelization) {
-          data->G.outputs[i] = basis_data->d_collo_grad_1d;
-          code << "  __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n";
-          code << "  loadMatrix<Q_1d,Q_1d>(data, G.outputs[" << i << "], s_G_out_" << i << ");\n";
+          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
         } else {
-          bool has_collo_grad = basis_data->d_collo_grad_1d;
-          data->G.outputs[i]  = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
-          code << "  __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
-          code << "  loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_out_" + std::to_string(i))) << ",Q_1d>(data, G.outputs[" << i << "], s_G_out_"
-               << i << ");\n";
+          bool is_collocated = false;
+
+          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+          if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
+            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
+          } else {
+            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+          }
         }
-        break;
-      // LCOV_EXCL_START
-      case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-        break;  // Should not occur
       }
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
-        break;  // Should not occur
+      if (is_at_points) break;  // No G mat for AtPoints
+      if (use_3d_slices) {
+        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
+        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
+        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
+          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
+
+          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+        } else if (is_active && skip_active_load) {
+          code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
+        } else {
+          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+        }
+      } else {
+        bool has_collo_grad = basis_data->d_collo_grad_1d;
+
+        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
+        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
+        if (has_collo_grad) {
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
+            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
+
+            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else if (is_active && skip_active_load) {
+            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
+          } else {
+            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
+            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          }
+        } else {
+          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
+            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
+
+            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
+          } else if (is_active && skip_active_load) {
+            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
+          } else {
+            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
+                 << (is_tensor ? "" : var_suffix) << "];\n";
+            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
+                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
+          }
+        }
       }
-        // LCOV_EXCL_STOP
-    }
+      break;
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
+      // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      break;  // TODO: Not implemented
+              // LCOV_EXCL_STOP
   }
-  code << "\n  // -- Element loop --\n";
-  code << "  __syncthreads();\n";
-  code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
-  // Input basis apply if needed
-  // Generate the correct eval mode code for each input
-  code << "    // -- Input field restrictions and basis actions --\n";
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "    // ---- Input field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+  CeedCallBackend(CeedBasisDestroy(&basis));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Restriction
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
+                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
+                                                      CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
+                                                      bool use_3d_slices) {
+  std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
+  CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
+  CeedInt                  elem_size = 0, num_comp = 0;
+  CeedSize                 l_size;
+  CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
+  CeedElemRestriction_Hip *rstr_data;
+  CeedElemRestriction      elem_rstr;
+
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
+    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
+  }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
 
-    // Restriction
-    if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) {
-      bool is_strided;
+  // Restriction
+  if (is_input) {
+    // Input
+    if (field_input_buffer[i] != i) {
+      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
 
-      code << "    CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n";
+      // Restriction was already done for previous input
+      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
+    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
+      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
+        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
+        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
+      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
+        // Otherwise we're using the scratch space
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+      }
+      switch (rstr_type) {
+        case CEED_RESTRICTION_STANDARD: {
+          CeedInt comp_stride;
 
-      CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-      if (!is_strided) {
-        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-        code << "    const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
+          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
+               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
+               << ");\n";
+          break;
+        }
+        case CEED_RESTRICTION_STRIDED: {
+          bool    has_backend_strides;
+          CeedInt num_elem;
+
+          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
+          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
+          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+
+          if (!has_backend_strides) {
+            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+          }
+          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
+               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
+               << var_suffix << ");\n";
+          break;
+        }
+        case CEED_RESTRICTION_POINTS: {
+          CeedInt comp_stride;
+
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+          break;
+        }
+        // LCOV_EXCL_START
+        case CEED_RESTRICTION_ORIENTED:
+        case CEED_RESTRICTION_CURL_ORIENTED:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+  } else {
+    // Output
+    switch (rstr_type) {
+      case CEED_RESTRICTION_STANDARD: {
         CeedInt comp_stride;
+
+        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-        code << "    // CompStride: " << comp_stride << "\n";
-        CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
-        data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-        code << "    readDofsOffset" << dim << "d<num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ">(data, l_size_in_" << i
-             << ", elem, indices.inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n";
-      } else {
+        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
+        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
+             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
+             << ");\n";
+        break;
+      }
+      case CEED_RESTRICTION_STRIDED: {
         bool    has_backend_strides;
         CeedInt num_elem;
 
@@ -395,334 +521,2202 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
         if (!has_backend_strides) {
           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
         }
-        code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-        code << "    readDofsStrided" << dim << "d<num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
-             << ">(data, elem, d_u_" << i << ", r_u_" << i << ");\n";
+        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
+             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
+             << ");\n";
+        break;
       }
+      case CEED_RESTRICTION_POINTS:
+        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
+        break;
+      // LCOV_EXCL_START
+      case CEED_RESTRICTION_ORIENTED:
+      case CEED_RESTRICTION_CURL_ORIENTED:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
     }
+  }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Basis
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
+                                                CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
+                                                bool is_at_points, bool use_3d_slices) {
+  bool      is_tensor = true, is_collocated = true;
+  CeedBasis basis;
+  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
+  CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+  CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
 
-    // TODO: put in a function?
-    // Basis action
-    code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
+  std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
+  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
+  CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
+  CeedElemRestriction elem_rstr;
+
+  // Get field data
+  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
+  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+  }
+  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+  if (basis != CEED_BASIS_NONE) {
+    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
+  }
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
+
+  // Basis
+  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+  if (is_input) {
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        if (!use_collograd_parallelization) {
-          code << "    CeedScalar* r_t_" << i << " = r_u_" << i << ";\n";
+        if (!use_3d_slices && !is_at_points) {
+          code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
         }
         break;
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n";
-        code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_in_" << i << ",P_in_" << i << ",Q_1d>(data, r_u_" << i << ", s_B_in_"
-             << i << ", r_t_" << i << ");\n";
+        if (is_at_points) {
+          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
+
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+        } else {
+          std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                                                   std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                                                : "InterpNonTensor";
+          std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
+
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+               << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+        }
         break;
       case CEED_EVAL_GRAD:
-        if (use_collograd_parallelization) {
-          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_in_" << i << ",P_in_" << i << ",Q_1d>(data, r_u_" << i
-               << ", s_B_in_" << i << ", r_t_" << i << ");\n";
+        if (is_at_points) {
+          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
+
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
+        } else if (use_3d_slices) {
+          std::string function_name =
+              (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";
+
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
+               << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+        } else if (is_tensor) {
+          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
+          std::string function_name =
+              (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
+              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
+          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
+
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+               << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         } else {
-          CeedInt P_1d;
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*dim*Q_1d];\n";
-          code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp_in_" << i
-               << ",P_in_" << i << ",Q_1d>(data, r_u_" << i << ", s_B_in_" << i << ", s_G_in_" << i << ", r_t_" << i << ");\n";
+          std::string function_name = "GradNonTensor";
+
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+               << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
         }
         break;
-      case CEED_EVAL_WEIGHT:
-        code << "    CeedScalar r_t_" << i << "[Q_1d];\n";
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
-        data->W = basis_data->d_q_weight_1d;
-        code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<Q_1d>(data, W, r_t_" << i << ");\n";
-        break;  // No action
+      case CEED_EVAL_WEIGHT: {
+        if (is_at_points) {
+          code << tab << "// Nothing to do AtPoints\n";
+        } else {
+          CeedBasis_Hip_shared *basis_data;
+          std::string           function_name = is_tensor
+                                                    ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                                                    : "WeightNonTensor";
+
+          code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
+          CeedCallBackend(CeedBasisGetData(basis, &basis_data));
+          data->W = basis_data->d_q_weight_1d;
+          code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
+        }
+        break;
+      }
+      // LCOV_EXCL_START
       case CEED_EVAL_DIV:
-        break;  // TODO: Not implemented
       case CEED_EVAL_CURL:
         break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
     }
-  }
+  } else {
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
+        break;  // No action
+      case CEED_EVAL_INTERP:
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+        if (is_at_points) {
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-  // TODO: put in a function + separate collograd logic
-  // Q function
-  code << "\n    // -- Output field setup --\n";
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "\n    // ---- Output field " << i << " ----\n";
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_GRAD) {
-      if (use_collograd_parallelization) {
-        // Accumulator for gradient slices
-        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n";
-        code << "    for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n";
-        code << "      for (CeedInt j = 0; j < Q_1d; ++j) {\n";
-        code << "        r_tt_" << i << "[j + i*Q_1d] = 0.0;\n";
-        code << "      }\n";
-        code << "    }\n";
-      } else {
-        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*dim*Q_1d];\n";
-      }
-    }
-    if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
-      code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1d];\n";
-    }
-  }
-  // We treat quadrature points per slice in 3d to save registers
-  if (use_collograd_parallelization) {
-    code << "\n    // Note: Using planes of 3D elements\n";
-    code << "#pragma unroll\n";
-    code << "    for (CeedInt q = 0; q < Q_1d; q++) {\n";
-    code << "      // -- Input fields --\n";
-    for (CeedInt i = 0; i < num_input_fields; i++) {
-      code << "      // ---- Input field " << i << " ----\n";
-      // Get elem_size, eval_mode, num_comp
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-      // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-      switch (eval_mode) {
-        case CEED_EVAL_NONE:
-          bool is_strided;
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else {
+          std::string function_name =
+              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                           std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                        : "InterpTransposeNonTensor";
+          std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+               << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        }
+        break;
+      case CEED_EVAL_GRAD:
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
+        if (is_at_points) {
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
 
-          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-          if (!is_strided) {
-            CeedInt comp_stride;
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else if (use_3d_slices) {
+          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                                      std::to_string(dim) + "d";
 
-            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-            code << "      const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
-            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-            code << "      // CompStride: " << comp_stride << "\n";
-            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
-            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
-            code << "      readSliceQuadsOffset"
-                 << "3d<num_comp_in_" << i << ", " << comp_stride << ", Q_1d>(data, l_size_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_"
-                 << i << ", r_q_" << i << ");\n";
-          } else {
-            bool    has_backend_strides;
-            CeedInt num_elem;
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
+               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else if (is_tensor) {
+          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
+          std::string function_name =
+              (dim == 1 ? "GradTranspose"
+                        : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
+              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
+          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
-            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
-            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+        } else {
+          std::string function_name = "GradTransposeNonTensor";
 
-            if (!has_backend_strides) {
-              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
-            }
-            code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-            code << "      readSliceQuadsStrided"
-                 << "3d<num_comp_in_" << i
-                 << ",Q_1d"
-                    ","
-                 << strides[0] << "," << strides[1] << "," << strides[2] << ">(data, elem, q, d_u_" << i << ", r_q_" << i << ");\n";
-          }
-          break;
-        case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
-          code << "      for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n";
-          code << "        r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1d];\n";
-          code << "      }\n";
-          break;
-        case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "*dim];\n";
-          code << "      gradCollo3d<num_comp_in_" << i << ",Q_1d>(data, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ");\n";
-          break;
-        case CEED_EVAL_WEIGHT:
-          code << "      CeedScalar r_q_" << i << "[1];\n";
-          code << "      r_q_" << i << "[0] = r_t_" << i << "[q];\n";
-          break;  // No action
+          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+               << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+        }
+        break;
+      // LCOV_EXCL_START
+      case CEED_EVAL_WEIGHT:
+        break;  // Should not occur
+      case CEED_EVAL_DIV:
+      case CEED_EVAL_CURL:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
+    }
+  }
+  CeedCallBackend(CeedBasisDestroy(&basis));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// QFunction
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
+                                                    CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
+                                                    CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
+                                                    CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
+                                                    std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
+                                                    bool use_3d_slices, bool is_assemble) {
+  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
+  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
+  CeedElemRestriction elem_rstr;
+
+  // Setup output arrays
+  code << "\n";
+  code << tab << "// -- Output field setup\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    const char *field_name;
+    std::string var_suffix = "_out_" + std::to_string(i);
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    switch (eval_mode) {
+      case CEED_EVAL_NONE:
+        if (is_at_points) {
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
+        } else {
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+               << "];\n";
+        }
+        break;
+      case CEED_EVAL_INTERP:
+        if (is_at_points) {
+          // Accumulator for point data
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+               << "[i] = 0.0;\n";
+        } else {
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
+               << "];\n";
+        }
+        break;
+      case CEED_EVAL_GRAD:
+        if (is_at_points) {
+          // Accumulator for point data
+          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
+               << "[i] = 0.0;\n";
+        } else if (use_3d_slices) {
+          // Accumulator for gradient slices
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
+          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
+        } else {
+          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
+               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
+        }
+        break;
+      case CEED_EVAL_WEIGHT:
+        break;
+        // LCOV_EXCL_START
+      case CEED_EVAL_DIV:
+      case CEED_EVAL_CURL:
+        break;  // TODO: Not implemented
+                // LCOV_EXCL_STOP
+    }
+  }
+
+  if (is_at_points) {
+    // We need to handle batches of points
+    code << "\n";
+    code << tab << "// Note: Using batches of points\n";
+    code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
+    code << tab << "#pragma unroll\n";
+    code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
+    tab.push();
+    code << tab << "const CeedInt p = i % max_num_points;\n\n";
+
+    code << tab << "// -- Coordinates\n";
+    code << tab << "CeedScalar r_x[max_dim];\n";
+    code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
+
+    code << tab << "// -- Input fields\n";
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      const char *field_name;
+      std::string var_suffix = "_in_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      // Basis action
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_INTERP:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_WEIGHT:
+          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
+          code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
+          break;
+          // LCOV_EXCL_START
         case CEED_EVAL_DIV:
-          break;  // TODO: Not implemented
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
-    code << "\n      // -- Output fields --\n";
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
+      const char *field_name;
+      std::string var_suffix = "_out_" + std::to_string(i);
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
       switch (eval_mode) {
         case CEED_EVAL_NONE:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
-          break;  // No action
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          break;
         case CEED_EVAL_INTERP:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*dim];\n";
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
           break;
+          // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
+          break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+
+  } else if (use_3d_slices) {
+    // We treat quadrature points per slice in 3d to save registers
+    code << "\n";
+    code << tab << "// Note: Using planes of 3D elements\n";
+    code << tab << "#pragma unroll\n";
+    code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
+    tab.push();
+    code << tab << "// -- Input fields\n";
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      const char *field_name;
+      std::string var_suffix = "_in_" + std::to_string(i);
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      // Basis action
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          bool is_strided;
+
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
+          if (is_strided) {
+            bool    has_backend_strides;
+            CeedInt num_elem, elem_size;
+
+            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
+            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
+            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+
+            if (!has_backend_strides) {
+              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+            }
+            code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
+                 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
+            code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
+                 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
+          } else {
+            CeedSize                 l_size = 0;
+            CeedInt                  comp_stride;
+            CeedElemRestriction_Hip *rstr_data;
+
+            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+            code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+            code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
+            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
+            code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
+                 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
+          }
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+          break;
+        case CEED_EVAL_INTERP:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
+          tab.push();
+          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
+          tab.pop();
+          code << tab << "}\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
+               << var_suffix << ", r_s" << var_suffix << ");\n";
+          break;
+        case CEED_EVAL_WEIGHT:
+          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
+          code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+    code << "\n";
+    code << tab << "// -- Output fields\n";
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
+      std::string var_suffix = "_out_" + std::to_string(i);
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Basis action
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          break;
+        case CEED_EVAL_INTERP:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_WEIGHT:
+          break;  // Should not occur
+        case CEED_EVAL_DIV:
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
   } else {
-    code << "\n      // Note: Using full elements\n";
-    code << "      // -- Input fields --\n";
+    code << "\n";
+    code << tab << "// Note: Using full elements\n";
+    code << tab << "{\n";
+    tab.push();
+    code << tab << "// -- Input fields\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      code << "      // ---- Input field " << i << " ----\n";
-      code << "      CeedScalar* r_q_" << i << " = r_t_" << i << ";\n";
+      const char *field_name;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
+      code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
     }
-    code << "      // -- Output fields --\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
-      code << "      CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n";
+      const char *field_name;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
     }
   }
-  code << "\n      // -- QFunction Inputs and outputs --\n";
-  code << "      CeedScalar* in[" << num_input_fields << "];\n";
+
+  // Input and output buffers
+  code << "\n";
+  code << tab << "// -- QFunction inputs and outputs\n";
+  code << tab << "// ---- Inputs\n";
+  code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    code << "      // ---- Input field " << i << " ----\n";
-    code << "      in[" << i << "] = r_q_" << i << ";\n";
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
+    code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
+    code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
   }
-  code << "      CeedScalar* out[" << num_output_fields << "];\n";
+  code << tab << "// ---- Outputs\n";
+  code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "      // ---- Output field " << i << " ----\n";
-    code << "      out[" << i << "] = r_qq_" << i << ";\n";
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
+    code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
   }
-  code << "\n      // -- Apply QFunction --\n";
-  code << "      " << qfunction_name << "(ctx, ";
-  if (dim != 3 || use_collograd_parallelization) {
+
+  // Apply QFunction
+  code << "\n";
+  code << tab << "// -- Apply QFunction\n";
+#ifdef __HIP_PLATFORM_SPIRV__
+  code << tab << "if (elem < num_elem) {\n";
+  tab.push();
+#endif
+  code << tab << "" << qfunction_name << "(ctx, ";
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
     code << "1";
   } else {
-    code << "Q_1d";
+    code << Q_name;
   }
-  code << ", in, out);\n";
-  if (use_collograd_parallelization) {
-    code << "      // -- Output fields --\n";
+  code << ", inputs, outputs);\n";
+#ifdef __HIP_PLATFORM_SPIRV__
+  tab.pop();
+  code << tab << "}\n";
+#endif
+
+  if (is_at_points) {
+    // Map back to coefficients
+    code << "\n";
+    code << tab << "// -- Output fields\n";
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      code << "      // ---- Output field " << i << " ----\n";
+      const char *field_name;
+      std::string var_suffix = "_out_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       // Basis action
-      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
       switch (eval_mode) {
-        case CEED_EVAL_NONE:
-          code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
-          code << "        r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n";
-          code << "      }\n";
-          break;  // No action
+        case CEED_EVAL_NONE: {
+          CeedInt             comp_stride;
+          CeedElemRestriction elem_rstr;
+
+          if (is_assemble) break;
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+          code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
+               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
+               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
+          break;
+        }
         case CEED_EVAL_INTERP:
-          code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
-          code << "        r_tt_" << i << "[q + j*Q_1d] = r_qq_" << i << "[j];\n";
-          code << "      }\n";
+          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
+          tab.push();
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          tab.pop();
+          code << tab << "}\n";
+          code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      gradColloTranspose3d<num_comp_out_" << i << ",Q_1d>(data, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i << ");\n";
+          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
+          tab.push();
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
+          tab.pop();
+          code << tab << "}\n";
+          code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
+               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
           break;
+          // LCOV_EXCL_START
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
         case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
+      }
+    }
+  } else if (use_3d_slices) {
+    // Copy or apply transpose grad, if needed
+    code << "\n";
+    code << tab << "// -- Output fields\n";
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      const char *field_name;
+      std::string var_suffix = "_out_" + std::to_string(i);
+      std::string P_name     = "P_1d" + var_suffix;
+
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      // Basis action
+      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          tab.push();
+          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
+          tab.pop();
+          code << tab << "}\n";
+          break;
+        case CEED_EVAL_INTERP:
+          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
+          tab.push();
+          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
+          tab.pop();
+          code << tab << "}\n";
+          break;
+        case CEED_EVAL_GRAD:
+          code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
+               << var_suffix << ", r_q" << var_suffix << ");\n";
+          break;
+          // LCOV_EXCL_START
+        case CEED_EVAL_WEIGHT:
+          break;  // Should not occur
+        case CEED_EVAL_DIV:
         case CEED_EVAL_CURL:
           break;  // TODO: Not implemented
+                  // LCOV_EXCL_STOP
       }
     }
-    code << "    }\n";
   }
+  tab.pop();
+  code << tab << "}\n";
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Output basis apply if needed
-  // Generate the correct eval mode code for each output
-  code << "\n    // -- Output field basis action and restrictions --\n";
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    code << "    // ---- Output field " << i << " ----\n";
-    // Get elem_size, eval_mode, num_comp
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-    // TODO put in a function
-    // Basis action
-    code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        code << "    CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n";
-        break;  // No action
-      case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
-        code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_out_" << i << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i
-             << ", s_B_out_" << i << ", r_v_" << i << ");\n";
-        break;
-      case CEED_EVAL_GRAD:
-        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
-        if (use_collograd_parallelization) {
-          code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp_out_" << i << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i
-               << ", s_B_out_" << i << ", r_v_" << i << ");\n";
-        } else {
-          CeedInt P_1d;
-          CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-          CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-          code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp_out_" << i
-               << ",P_out_" << i << ",Q_1d>(data, r_tt_" << i << ", s_B_out_" << i << ", s_G_out_" << i << ", r_v_" << i << ");\n";
-        }
-        break;
-      // LCOV_EXCL_START
-      case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
-        break;  // Should not occur
-      }
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
-        break;  // Should not occur
+//------------------------------------------------------------------------------
+// Build single operator kernel
+//------------------------------------------------------------------------------
+extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
+  bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
+  Ceed                   ceed;
+  CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
+  CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Hip_gen *qf_data;
+  CeedQFunction          qf;
+  CeedOperatorField     *op_input_fields, *op_output_fields;
+  CeedOperator_Hip_gen  *data;
+  std::ostringstream     code;
+  Tab                    tab;
+
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  {
+    bool is_setup_done;
+
+    CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
+    if (is_setup_done) {
+      *is_good_build = !data->use_fallback;
+      return CEED_ERROR_SUCCESS;
+    }
+  }
+
+  // Check field compatibility
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  {
+    bool has_shared_bases = true;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
       }
-        // LCOV_EXCL_STOP
+      CeedCallBackend(CeedBasisDestroy(&basis));
     }
-    // TODO put in a function
-    // Restriction
-    bool is_strided;
 
-    CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-    if (!is_strided) {
-      CeedInt comp_stride;
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedBasis basis;
 
-      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
-      code << "    const CeedInt l_size_out_" << i << " = " << l_size << ";\n";
-      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
-      code << "    // CompStride: " << comp_stride << "\n";
-      CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
-      data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
-      code << "    writeDofsOffset" << dim << "d<num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ">(data, l_size_out_" << i
-           << ", elem, indices.outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n";
-    } else {
-      bool    has_backend_strides;
-      CeedInt num_elem;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
 
-      CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
-      CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
-      CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
 
-      if (!has_backend_strides) {
-        CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
       }
-      code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
-      code << "    writeDofsStrided" << dim << "d<num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
-           << ">(data, elem, r_v_" << i << ", d_v_" << i << ");\n";
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+    // -- Fallback to ref if not all bases are shared
+    if (!has_shared_bases) {
+      *is_good_build = false;
+      return CEED_ERROR_SUCCESS;
+    }
+  }
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Get operator data
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  {
+    CeedInt max_P = 0, max_P_1d = 0;
+
+    CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
+                                                        qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
+    data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
+  }
+  if (is_at_points) {
+    CeedInt                  coords_dim = 0;
+    CeedElemRestriction_Hip *rstr_data;
+    CeedElemRestriction      rstr_points = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim));
+    CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
+    data->points.indices = (CeedInt *)rstr_data->d_offsets;
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+    if (max_dim == 0) max_dim = coords_dim;
+    if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim));
+  }
+  if (max_dim == 0) max_dim = 1;
+  data->dim = max_dim;
+  if (is_at_points) use_3d_slices = false;
+  if (Q_1d == 0) {
+    if (is_at_points) Q_1d = max_num_points;
+    else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
+  }
+  if (Q == 0) Q = Q_1d;
+  data->Q    = Q;
+  data->Q_1d = Q_1d;
+
+  // Check for restriction only identity operator
+  {
+    bool is_identity_qf;
+
+    CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
+    if (is_identity_qf) {
+      CeedEvalMode eval_mode_in, eval_mode_out;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
+      CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
+                "Backend does not implement restriction only identity operators");
     }
   }
 
-  code << "  }\n";
-  code << "}\n";
-  code << "// -----------------------------------------------------------------------------\n\n";
+  // Load basis source files
+  if (!is_all_nontensor) {
+    code << tab << "// Tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
+  }
+  if (!is_all_tensor) {
+    code << tab << "// Non-tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
+  }
+  if (is_at_points) {
+    code << tab << "// AtPoints basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
+  }
+  if (!is_all_tensor && !is_all_nontensor) {
+    code << tab << "// Tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
+  }
+  code << tab << "// CodeGen operator source\n";
+  code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
 
-  // View kernel for debugging
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated Operator Kernels:\n");
-  CeedDebug(ceed, code.str().c_str());
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
 
-  CeedInt block_sizes[3] = {0, 0, 0};
-  CeedInt num_elem;
+  operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
 
-  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(BlockGridCalculate_Hip_gen(dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
-  CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE",
-                                  block_sizes[0] * block_sizes[1] * block_sizes[2]));
-  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
+  // Define CEED_Q_VLA
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
+    code << tab << "#define CEED_Q_VLA 1\n\n";
+  } else {
+    code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+  }
 
-  CeedCallBackend(CeedOperatorSetSetupDone(op));
+  // Add user QFunction source
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
+
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
+  }
+
+  // Setup
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
+  code << "__global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
+  tab.push();
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+  }
+
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
+  if (!is_all_tensor) {
+    code << tab << "const CeedInt Q = " << Q << ";\n";
+  }
+  if (!is_all_nontensor) {
+    code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
+  }
+  if (is_at_points) {
+    code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
+    code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
+  }
+
+  // Shared data
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Hip data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+
+  // -- Determine input mat reuse
+  FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          input_matrix_reuse[i].index     = j;
+          input_matrix_reuse[i].is_input  = true;
+          input_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            input_matrix_reuse[i].index     = j;
+            input_matrix_reuse[i].is_input  = true;
+            input_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = true;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = true;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = false;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = false;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // Initialize constants, and matrices B and G
+  code << "\n" << tab << "// Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
+  }
+  code << "\n" << tab << "// Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
+  }
+
+  // Loop over all elements
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+#ifdef __HIP_PLATFORM_SPIRV__
+  code << tab << "CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z;\n";
+#else
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
+#endif
+
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+
+  // -- Input restriction and basis
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    // ---- Restriction
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                               max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                         is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                           qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices, false));
+
+  // -- Output basis and restriction
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    const char *field_name;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
+                                                         is_all_tensor, is_at_points, use_3d_slices));
+
+    // ---- Restriction
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
+                                                               false, is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // Close loop and function
+#ifndef __HIP_PLATFORM_SPIRV__
+  tab.pop();
+  code << tab << "}\n";
+#endif
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
+
+  CeedInt block_sizes[3] = {0, 0, 0};
+  CeedInt num_elem;
+
+  // Compile
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
+  {
+    bool is_compile_good = false;
+
+    data->thread_1d = block_sizes[0];
+    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
+                                       block_sizes[0] * block_sizes[1] * block_sizes[2]));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
+    } else {
+      *is_good_build     = false;
+      data->use_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Build AtPoints assembly operator kernel
+//------------------------------------------------------------------------------
+static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
+  bool                   is_all_tensor = true, is_at_points = false, use_3d_slices = false;
+  Ceed                   ceed;
+  CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
+  CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Hip_gen *qf_data;
+  CeedQFunction          qf;
+  CeedOperatorField     *op_input_fields, *op_output_fields;
+  CeedOperator_Hip_gen  *data;
+  std::ostringstream     code;
+  Tab                    tab;
+
+  // Check compatibility
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
+
+  // Retrieve operator data
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  Q       = data->Q;
+  Q_1d    = data->Q_1d;
+  max_dim = data->dim;
+  {
+    CeedElemRestriction rstr_points = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+
+  // Load basis source files
+  code << tab << "// Tensor basis source\n";
+  code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
+  code << tab << "// AtPoints basis source\n";
+  code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
+  code << tab << "// CodeGen operator source\n";
+  code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
+
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
+
+  if (is_full) {
+    operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
+  } else {
+    operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
+  }
+
+  // Define CEED_Q_VLA
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
+  code << tab << "#define CEED_Q_VLA 1\n\n";
+
+  // Add user QFunction source
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
+
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
+  }
+
+  // Setup
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Assembly Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
+          "points, CeedScalar *__restrict__ values_array) {\n";
+  tab.push();
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+  }
+
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
+  code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
+  code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
+  code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
+
+  // Shared data
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Hip data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+
+  // -- Determine input mat reuse
+  FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        input_matrix_reuse[i].index     = j;
+        input_matrix_reuse[i].is_input  = true;
+        input_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        output_matrix_reuse[i].index     = j;
+        output_matrix_reuse[i].is_input  = true;
+        output_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        output_matrix_reuse[i].index     = j;
+        output_matrix_reuse[i].is_input  = false;
+        output_matrix_reuse[i].eval_mode = eval_mode_j;
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // Initialize constants, and matrices B and G
+  code << "\n" << tab << "// Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
+  }
+  code << "\n" << tab << "// Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
+  }
+
+  // Loop over all elements
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+#ifdef __HIP_PLATFORM_SPIRV__
+  code << tab << "CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z;\n";
+#else
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
+#endif
+
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+
+  // -- Input restriction and basis
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
+  CeedInt active_field_index = -1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    if (is_active) {
+      std::string var_suffix = "_in_" + std::to_string(f);
+
+      code << tab << "// Active field - no restriction or basis action here\n";
+      if (active_field_index == -1) {
+        active_field_index = f;
+        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
+             << "] = {0.0};\n";
+      } else {
+        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
+      }
+    } else {
+      // ---- Restriction
+      CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+
+      // ---- Basis action
+      CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                           is_all_tensor, is_at_points, use_3d_slices));
+    }
+  }
+
+  // -- Loop over active field
+  std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
+
+  code << "\n" << tab << "// Loop over nodes in active field\n";
+  code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
+       << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
+  tab.push();
+
+  // -- Set current active node and component to 1
+  code << tab << "// Set current active node and component to 1.0\n";
+  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
+       << active_var_suffix << ");\n\n";
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                         is_all_tensor, is_at_points, use_3d_slices));
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                           qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
+
+  // -- Output basis and restriction
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool        is_active = false;
+    const char *field_name;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+
+    // ---- Basis action
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
+                                                         is_all_tensor, is_at_points, use_3d_slices));
+
+    // ---- Restriction
+    if (is_full) {
+      std::string         var_suffix = "_out_" + std::to_string(i);
+      CeedInt             comp_stride;
+      CeedSize            l_size;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+      code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
+           << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    } else {
+      std::string         var_suffix = "_out_" + std::to_string(i);
+      CeedInt             comp_stride;
+      CeedSize            l_size;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
+      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
+      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
+      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
+      code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
+           << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+
+  // -- Reset current active node and component
+  code << "\n" << tab << "// Reset current active node and component to 0.0\n";
+  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
+       << active_var_suffix << ");\n";
+
+  // -- End of loop over active field
+  tab.pop();
+  code << tab << "}\n";
+
+  // Close loop and function
+#ifndef __HIP_PLATFORM_SPIRV__
+  tab.pop();
+  code << tab << "}\n";
+#endif
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
+
+  CeedInt block_sizes[3] = {0, 0, 0};
+  CeedInt num_elem;
+
+  // Compile
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
+  {
+    bool is_compile_good = false;
+
+    data->thread_1d = block_sizes[0];
+    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
+                                       is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
+                                       "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
+                                        is_full ? &data->assemble_full : &data->assemble_diagonal));
+    } else {
+      *is_good_build              = false;
+      data->use_assembly_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
+
+extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
+  return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
+}
+
+extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
+  return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
+}
+//------------------------------------------------------------------------------
+// Build QFunction assembly operator kernel
+//------------------------------------------------------------------------------
+extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
+  bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
+  Ceed                   ceed;
+  CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
+  CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+  CeedQFunction_Hip_gen *qf_data;
+  CeedQFunction          qf;
+  CeedOperatorField     *op_input_fields, *op_output_fields;
+  CeedOperator_Hip_gen  *data;
+  std::ostringstream     code;
+  Tab                    tab;
+
+  // Check compatibility
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
+
+  // Check field compatibility
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  {
+    bool has_shared_bases = true;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+      if (basis != CEED_BASIS_NONE) {
+        bool        is_tensor = true;
+        const char *resource;
+        char       *resource_root;
+        Ceed        basis_ceed;
+
+        CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+        is_all_tensor    = is_all_tensor && is_tensor;
+        is_all_nontensor = is_all_nontensor && !is_tensor;
+
+        CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
+        CeedCallBackend(CeedGetResource(basis_ceed, &resource));
+        CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
+        has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
+        CeedCallBackend(CeedFree(&resource_root));
+        CeedCallBackend(CeedDestroy(&basis_ceed));
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis));
+    }
+  }
+
+  // Retrieve operator data
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+  Q       = data->Q;
+  Q_1d    = data->Q_1d;
+  max_dim = data->dim;
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Load basis source files
+  if (!is_all_nontensor) {
+    code << tab << "// Tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
+  }
+  if (!is_all_tensor) {
+    code << tab << "// Non-tensor basis source\n";
+    code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
+  }
+  if (!is_all_tensor && !is_all_nontensor) {
+    code << "// Tensor basis source\n";
+    code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
+  }
+  code << "// CodeGen operator source\n";
+  code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
+
+  // Get QFunction name
+  std::string qfunction_name(qf_data->qfunction_name);
+  std::string operator_name;
+
+  operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name;
+
+  // Define CEED_Q_VLA
+  code << "\n" << tab << "#undef CEED_Q_VLA\n";
+  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
+    code << tab << "#define CEED_Q_VLA 1\n\n";
+  } else {
+    code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+  }
+
+  // Add user QFunction source
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
+
+    code << tab << "// User QFunction source\n";
+    code << tab << "#include \"" << source_path << "\"\n\n";
+  }
+
+  // Setup
+  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "// Operator Assembly Kernel\n";
+  code << tab << "// \n";
+  code << tab << "// d_[in,out]_i:   CeedVector device array\n";
+  code << tab << "// r_[in,out]_e_i: Element vector register\n";
+  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
+  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
+  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
+  code << tab << "// \n";
+  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
+  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
+  code << tab << "// -----------------------------------------------------------------------------\n";
+  code << tab << "extern \"C\" __global__ void " << operator_name
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
+          "points, CeedScalar *__restrict__ values_array) {\n";
+  tab.push();
+
+  // Scratch buffers
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
+      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool is_active = false;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (is_active) {
+      code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
+    }
+  }
+
+  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
+  if (!is_all_tensor) {
+    code << tab << "const CeedInt Q = " << Q << ";\n";
+  }
+  if (!is_all_nontensor) {
+    code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
+  }
+
+  // Shared data
+  code << tab << "extern __shared__ CeedScalar slice[];\n";
+  code << tab << "SharedData_Hip data;\n";
+  code << tab << "data.t_id_x = threadIdx.x;\n";
+  code << tab << "data.t_id_y = threadIdx.y;\n";
+  code << tab << "data.t_id_z = threadIdx.z;\n";
+  code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
+  code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
+
+  // -- Determine input mat reuse
+  FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    input_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
+    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          input_matrix_reuse[i].index     = j;
+          input_matrix_reuse[i].is_input  = true;
+          input_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            input_matrix_reuse[i].index     = j;
+            input_matrix_reuse[i].is_input  = true;
+            input_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // -- Determine output mat reuse
+  FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    output_matrix_reuse[i].index = -1;
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool         is_tensor = true;
+    CeedEvalMode eval_mode_i;
+    CeedBasis    basis_i;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
+    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
+    CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = true;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = true;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
+      CeedEvalMode eval_mode_j;
+      CeedBasis    basis_j;
+
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
+      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
+      if (basis_i == basis_j) {
+        if (is_tensor) {
+          output_matrix_reuse[i].index     = j;
+          output_matrix_reuse[i].is_input  = false;
+          output_matrix_reuse[i].eval_mode = eval_mode_j;
+        } else {
+          // For non-tensor can only re-use with the same eval mode
+          if (eval_mode_i == eval_mode_j) {
+            output_matrix_reuse[i].index     = j;
+            output_matrix_reuse[i].is_input  = false;
+            output_matrix_reuse[i].eval_mode = eval_mode_j;
+          }
+        }
+      }
+      CeedCallBackend(CeedBasisDestroy(&basis_j));
+    }
+    CeedCallBackend(CeedBasisDestroy(&basis_i));
+  }
+
+  // Initialize constants, and matrices B and G
+  code << "\n" << tab << "// Input field constants and basis data\n";
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true));
+  }
+  code << "\n" << tab << "// Output field constants and basis data\n";
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
+                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true));
+  }
+
+  // Loop over all elements
+  code << "\n" << tab << "// Element loop\n";
+  code << tab << "__syncthreads();\n";
+#ifdef __HIP_PLATFORM_SPIRV__
+  code << tab << "CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z;\n";
+#else
+  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
+  tab.push();
+#endif
+
+  // -- Compute minimum buffer space needed
+  CeedInt max_rstr_buffer_size = 1;
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode != CEED_EVAL_NONE) {
+      CeedInt             num_comp;
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+  }
+  code << tab << "// Scratch restriction buffer space\n";
+  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
+
+  // -- Determine best input field processing order
+  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    field_rstr_in_buffer[i] = -1;
+    input_field_order[i]    = -1;
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      field_rstr_in_buffer[i]       = i;
+      is_ordered[i]                 = true;
+      input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          field_rstr_in_buffer[j]       = i;
+          is_ordered[j]                 = true;
+          input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+
+  // -- Input restriction and basis
+  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
+  CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
+  CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool          is_active = false;
+    const char   *field_name;
+    const CeedInt f = input_field_order[i];
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
+    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
+
+    if (is_active) {
+      CeedEvalMode eval_mode;
+      CeedInt      field_size;
+
+      active_fields_in[num_active_in] = f;
+      num_active_in++;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
+      if (eval_mode == CEED_EVAL_GRAD) {
+        code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
+             << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
+      } else {
+        code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
+      }
+      code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
+    } else {
+      // ---- Restriction
+      CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
+                                                                 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
+
+      // ---- Basis action
+      CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
+                                                           is_all_tensor, is_at_points, use_3d_slices));
+    }
+  }
+  code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
+  for (CeedInt i = 0; i < num_active_in; i++) {
+    code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
+  }
+  code << "};\n";
+  code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
+  for (CeedInt i = 0; i < num_active_in; i++) {
+    code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
+  }
+  code << "};\n";
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool is_active = false;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (is_active) {
+      const char *field_name;
+      CeedInt     field_size;
+
+      active_fields_out[num_active_out] = i;
+      num_active_out++;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+      qf_assembly_size_out += field_size;
+      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+      code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
+    }
+  }
+  code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
+  for (CeedInt i = 0; i < num_active_out; i++) {
+    code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
+  }
+  code << "};\n";
+  code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
+
+  // -- Loop over active field
+  code << "\n" << tab << "CeedInt input_offset = 0;\n";
+  code << tab << "// Loop over active QFunction input fields\n";
+  code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
+  code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
+  tab.push();
+
+  // -- Loop over size of active field
+  code << "\n" << tab << "// Loop over current active input field size\n";
+  code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
+  code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
+  tab.push();
+
+  // -- Set current active point and component to 1
+  code << tab << "// Set current active point and component to 1.0\n";
+  if (is_all_tensor && (max_dim >= 3)) {
+    code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
+  } else {
+    code << tab << "r_q_in[a][s] = 1.0;\n";
+  }
+
+  // -- Q function
+  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
+                                                           qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
+                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
+
+  // -- Output basis and restriction
+  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
+  CeedScalar offset = 0;
+
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool        is_active = false;
+    const char *field_name;
+
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+    if (!is_active) continue;
+
+    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
+    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
+
+    // ---- Restriction
+    CeedInt field_size;
+
+    code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
+         << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
+    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+    offset += field_size;
+  }
+
+  // -- Reset current active node and component
+  code << "\n" << tab << "// Reset current active node and component to 0.0\n";
+  if (is_all_tensor && (max_dim >= 3)) {
+    code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
+  } else {
+    code << tab << "r_q_in[a][s] = 0.0;\n";
+  }
+
+  // -- End of loop over size of active field
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "input_offset += field_size_in;\n";
+
+  // -- End of loop over active field
+  tab.pop();
+  code << tab << "}\n";
+
+  // Close loop and function
+#ifndef __HIP_PLATFORM_SPIRV__
+  tab.pop();
+  code << tab << "}\n";
+#endif
+  tab.pop();
+  code << tab << "}\n";
+  code << tab << "// -----------------------------------------------------------------------------\n\n";
+
+  CeedInt block_sizes[3] = {0, 0, 0};
+  CeedInt num_elem;
+
+  // Compile
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
+  {
+    bool is_compile_good = false;
+
+    data->thread_1d = block_sizes[0];
+    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0],
+                                       "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
+    if (is_compile_good) {
+      *is_good_build = true;
+      CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
+    } else {
+      *is_good_build              = false;
+      data->use_assembly_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.h b/backends/hip-gen/ceed-hip-gen-operator-build.h
index c17ba46eeb..0bb7f20df3 100644
--- a/backends/hip-gen/ceed-hip-gen-operator-build.h
+++ b/backends/hip-gen/ceed-hip-gen-operator-build.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,4 +7,7 @@
 #pragma once
 
 CEED_INTERN int BlockGridCalculate_Hip_gen(CeedInt dim, CeedInt num_elem, CeedInt P_1d, CeedInt Q_1d, CeedInt *block_sizes);
-CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op);
+CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build);
+CEED_INTERN int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build);
diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c
index fcd58ed76d..7532ba55b6 100644
--- a/backends/hip-gen/ceed-hip-gen-operator.c
+++ b/backends/hip-gen/ceed-hip-gen-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -9,6 +9,7 @@
 #include <ceed/backend.h>
 #include <ceed/jit-source/hip/hip-types.h>
 #include <stddef.h>
+#include <hip/hiprtc.h>
 
 #include "../hip/ceed-hip-common.h"
 #include "../hip/ceed-hip-compile.h"
@@ -19,27 +20,51 @@
 // Destroy operator
 //------------------------------------------------------------------------------
 static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
+  Ceed                  ceed;
   CeedOperator_Hip_gen *impl;
+  bool                  is_composite;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorIsComposite(op, &is_composite));
+  if (is_composite) {
+    CeedInt num_suboperators;
+
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    for (CeedInt i = 0; i < num_suboperators; i++) {
+      if (impl->streams[i]) CeedCallHip(ceed, hipStreamDestroy(impl->streams[i]));
+      impl->streams[i] = NULL;
+    }
+  }
+  if (impl->module) CeedCallHip(ceed, hipModuleUnload(impl->module));
+  if (impl->module_assemble_full) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_full));
+  if (impl->module_assemble_diagonal) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_diagonal));
+  if (impl->module_assemble_qfunction) CeedCallHip(ceed, hipModuleUnload(impl->module_assemble_qfunction));
+  if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Apply and add to output
 //------------------------------------------------------------------------------
-static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream, const CeedScalar *input_arr, CeedScalar *output_arr,
+                                            bool *is_run_good, CeedRequest *request) {
+  bool                   is_at_points, is_tensor;
   Ceed                   ceed;
   CeedInt                num_elem, num_input_fields, num_output_fields;
   CeedEvalMode           eval_mode;
-  CeedVector             output_vecs[CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
   CeedQFunction_Hip_gen *qf_data;
   CeedQFunction          qf;
   CeedOperatorField     *op_input_fields, *op_output_fields;
   CeedOperator_Hip_gen  *data;
 
+  // Creation of the operator
+  CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, is_run_good));
+  if (!(*is_run_good)) return CEED_ERROR_SUCCESS;
+
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &data));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
@@ -48,37 +73,21 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
-  // Check for tensor-product bases
-  {
-    bool has_tensor_bases;
-
-    CeedCallBackend(CeedOperatorHasTensorBases(op, &has_tensor_bases));
-    // -- Fallback to ref if not all bases are tensor-product
-    if (!has_tensor_bases) {
-      CeedOperator op_fallback;
-
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to non-tensor bases");
-      CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
-      CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
-      return CEED_ERROR_SUCCESS;
-    }
-  }
-
-  // Creation of the operator
-  CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op));
-
   // Input vectors
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       data->fields.inputs[i] = NULL;
     } else {
+      bool       is_active;
       CeedVector vec;
 
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
-      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) data->fields.inputs[i] = input_arr;
+      else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -88,25 +97,48 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       data->fields.outputs[i] = NULL;
     } else {
+      bool       is_active;
       CeedVector vec;
 
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
-      output_vecs[i] = vec;
-      // Check for multiple output modes
-      CeedInt index = -1;
-      for (CeedInt j = 0; j < i; j++) {
-        if (vec == output_vecs[j]) {
-          index = j;
-          break;
-        }
-      }
-      if (index == -1) {
-        CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
-      } else {
-        data->fields.outputs[i] = data->fields.outputs[index];
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) data->fields.outputs[i] = output_arr;
+      else CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+  }
+
+  // Point coordinates, if needed
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  if (is_at_points) {
+    // Coords
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+    CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+    CeedCallBackend(CeedVectorDestroy(&vec));
+
+    // Points per elem
+    if (num_elem != data->points.num_elem) {
+      CeedInt            *points_per_elem;
+      const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+      CeedElemRestriction rstr_points = NULL;
+
+      data->points.num_elem = num_elem;
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+      CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+      for (CeedInt e = 0; e < num_elem; e++) {
+        CeedInt num_points_elem;
+
+        CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+        points_per_elem[e] = num_points_elem;
       }
+      if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
+      CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
+      CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+      CeedCallBackend(CeedFree(&points_per_elem));
     }
   }
 
@@ -114,29 +146,37 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
 
   // Apply operator
-  void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W};
-  const CeedInt dim       = data->dim;
-  const CeedInt Q_1d      = data->Q_1d;
-  const CeedInt P_1d      = data->max_P_1d;
-  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
-  CeedInt       block_sizes[3];
-
-  CeedCallBackend(BlockGridCalculate_Hip_gen(dim, num_elem, P_1d, Q_1d, block_sizes));
-  if (dim == 1) {
+  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
+
+  CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
+  CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
+
+  if (is_tensor) {
+    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
+  } else {
+    CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;
+
+    elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
+    block_sizes[2]  = elems_per_block;
+  }
+  if (data->dim == 1 || !is_tensor) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
 
-    CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
-  } else if (dim == 2) {
+    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                  is_run_good, opargs));
+  } else if (data->dim == 2) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
 
-    CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
-  } else if (dim == 3) {
+    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                  is_run_good, opargs));
+  } else if (data->dim == 3) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
 
-    CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
+    CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                  is_run_good, opargs));
   }
 
   // Restore input arrays
@@ -144,11 +184,13 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
-      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -157,26 +199,675 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
-      // Check for multiple output modes
-      CeedInt index = -1;
-      for (CeedInt j = 0; j < i; j++) {
-        if (vec == output_vecs[j]) {
-          index = j;
-          break;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (!is_active) CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+  }
+
+  // Restore point coordinates, if needed
+  if (is_at_points) {
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+    CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+    CeedCallBackend(CeedVectorDestroy(&vec));
+  }
+
+  // Restore context data
+  CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+  // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  if (!(*is_run_good)) data->use_fallback = true;
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+  bool              is_run_good = false;
+  const CeedScalar *input_arr   = NULL;
+  CeedScalar       *output_arr  = NULL;
+
+  // Try to run kernel
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
+  CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(op, NULL, input_arr, output_arr, &is_run_good, request));
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
+
+  // Fallback on unsuccessful run
+  if (!is_run_good) {
+    CeedOperator op_fallback;
+
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+  bool                  is_run_good[CEED_COMPOSITE_MAX] = {false}, is_sequential;
+  CeedInt               num_suboperators;
+  const CeedScalar     *input_arr  = NULL;
+  CeedScalar           *output_arr = NULL;
+  Ceed                  ceed;
+  CeedOperator_Hip_gen *impl;
+  CeedOperator         *sub_operators;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+  CeedCallBackend(CeedOperatorCompositeGetSubList(op, &sub_operators));
+  CeedCall(CeedOperatorCompositeIsSequential(op, &is_sequential));
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
+  for (CeedInt i = 0; i < num_suboperators; i++) {
+    CeedInt       num_elem     = 0;
+    const CeedInt stream_index = is_sequential ? 0 : i;
+
+    CeedCallBackend(CeedOperatorGetNumElements(sub_operators[i], &num_elem));
+    if (num_elem > 0) {
+      if (!impl->streams[stream_index]) CeedCallHip(ceed, hipStreamCreate(&impl->streams[stream_index]));
+      CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], impl->streams[stream_index], input_arr, output_arr, &is_run_good[i],
+                                                       request));
+    } else {
+      is_run_good[i] = true;
+    }
+  }
+  if (is_sequential) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[0]));
+  else {
+    for (CeedInt i = 0; i < num_suboperators; i++) {
+      if (impl->streams[i]) {
+        if (is_run_good[i]) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[i]));
+      }
+    }
+  }
+  if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
+  if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
+  CeedCallHip(ceed, hipDeviceSynchronize());
+
+  // Fallback on unsuccessful run
+  for (CeedInt i = 0; i < num_suboperators; i++) {
+    if (!is_run_good[i]) {
+      CeedOperator op_fallback;
+
+      CeedDebug(ceed, "\nFalling back to /gpu/hip/ref CeedOperator for ApplyAdd\n");
+      CeedCallBackend(CeedOperatorGetFallback(sub_operators[i], &op_fallback));
+      CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// QFunction assembly
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
+                                                           CeedRequest *request) {
+  Ceed                  ceed;
+  CeedOperator_Hip_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_qfunction && !data->use_assembly_fallback) {
+    bool is_build_good = false;
+
+    CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
+    if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(op, &is_build_good));
+    if (!is_build_good) data->use_assembly_fallback = true;
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                   is_run_good = true;
+    Ceed_Hip              *hip_data;
+    CeedInt                num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode           eval_mode;
+    CeedScalar            *assembled_array;
+    CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Hip_gen *qf_data;
+    CeedQFunction          qf;
+    CeedOperatorField     *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &hip_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Build objects if needed
+    if (build_objects) {
+      CeedInt qf_size_in = 0, qf_size_out = 0, Q;
+
+      // Count number of active input fields
+      {
+        for (CeedInt i = 0; i < num_input_fields; i++) {
+          CeedInt    field_size;
+          CeedVector vec;
+
+          // Get input vector
+          CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+          // Check if active input
+          if (vec == CEED_VECTOR_ACTIVE) {
+            CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+            qf_size_in += field_size;
+          }
+          CeedCallBackend(CeedVectorDestroy(&vec));
+        }
+        CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+      }
+
+      // Count number of active output fields
+      {
+        for (CeedInt i = 0; i < num_output_fields; i++) {
+          CeedInt    field_size;
+          CeedVector vec;
+
+          // Get output vector
+          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+          // Check if active output
+          if (vec == CEED_VECTOR_ACTIVE) {
+            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+            qf_size_out += field_size;
+          }
+          CeedCallBackend(CeedVectorDestroy(&vec));
         }
+        CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+      }
+      CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+
+      // Actually build objects now
+      const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
+      CeedInt        strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
+
+      // Create output restriction
+      CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out,
+                                                       (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides,
+                                                       rstr));
+      // Create assembled vector
+      CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
+    }
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArrayWrite(*assembled, CEED_MEM_DEVICE, &assembled_array));
+
+    // Assemble QFunction
+    bool  is_tensor = false;
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};
+
+    CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
+    CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};
+
+    if (is_tensor) {
+      CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
+    } else {
+      CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;
+
+      elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
+      block_sizes[2]  = elems_per_block;
+    }
+    if (data->dim == 1 || !is_tensor) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    } else if (data->dim == 2) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    } else if (data->dim == 3) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_qfunction, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    }
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
       }
-      if (index == -1) {
-        CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i]));
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) {
+      data->use_assembly_fallback = true;
+      if (build_objects) {
+        CeedCallBackend(CeedVectorDestroy(assembled));
+        CeedCallBackend(CeedElemRestrictionDestroy(rstr));
       }
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
 
-  // Restore context data
-  CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LinearAssembleQFunction\n");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
+    return CEED_ERROR_SUCCESS;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedOperatorLinearAssembleQFunction_Hip_gen(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, true, assembled, rstr, request);
+}
+
+static int CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Hip_gen(op, false, &assembled, &rstr, request);
+}
+
+//------------------------------------------------------------------------------
+// AtPoints diagonal assembly
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen(CeedOperator op, CeedVector assembled, CeedRequest *request) {
+  Ceed                  ceed;
+  CeedOperator_Hip_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_diagonal && !data->use_assembly_fallback) {
+    bool                     is_build_good = false;
+    CeedInt                  num_active_bases_in, num_active_bases_out;
+    CeedOperatorAssemblyData assembly_data;
+
+    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
+    CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
+                                                         NULL, NULL));
+    if (num_active_bases_in == num_active_bases_out) {
+      CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
+      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(op, &is_build_good));
+    }
+    if (!is_build_good) data->use_assembly_fallback = true;
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                   is_run_good = true;
+    Ceed_Hip              *hip_data;
+    CeedInt                num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode           eval_mode;
+    CeedScalar            *assembled_array;
+    CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Hip_gen *qf_data;
+    CeedQFunction          qf;
+    CeedOperatorField     *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &hip_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+
+      // Points per elem
+      if (num_elem != data->points.num_elem) {
+        CeedInt            *points_per_elem;
+        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+        CeedElemRestriction rstr_points = NULL;
+
+        data->points.num_elem = num_elem;
+        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+        for (CeedInt e = 0; e < num_elem; e++) {
+          CeedInt num_points_elem;
+
+          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+          points_per_elem[e] = num_points_elem;
+        }
+        if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
+        CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
+        CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+        CeedCallBackend(CeedFree(&points_per_elem));
+      }
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
+
+    // Assemble diagonal
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points, &assembled_array};
+
+    CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};
+
+    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
+    block_sizes[2] = 1;
+    if (data->dim == 1) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    } else if (data->dim == 2) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    } else if (data->dim == 3) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_diagonal, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2],
+                                                    sharedMem, &is_run_good, opargs));
+    }
+    CeedCallHip(ceed, hipDeviceSynchronize());
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Restore point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) data->use_assembly_fallback = true;
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints LinearAssembleAddDiagonal\n");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request));
+    return CEED_ERROR_SUCCESS;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// AtPoints full assembly
+//------------------------------------------------------------------------------
+static int CeedOperatorAssembleSingleAtPoints_Hip_gen(CeedOperator op, CeedInt offset, CeedVector assembled) {
+  Ceed                  ceed;
+  CeedOperator_Hip_gen *data;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &data));
+
+  // Build the assembly kernel
+  if (!data->assemble_full && !data->use_assembly_fallback) {
+    bool                     is_build_good = false;
+    CeedInt                  num_active_bases_in, num_active_bases_out;
+    CeedOperatorAssemblyData assembly_data;
+
+    CeedCallBackend(CeedOperatorGetOperatorAssemblyData(op, &assembly_data));
+    CeedCallBackend(CeedOperatorAssemblyDataGetEvalModes(assembly_data, &num_active_bases_in, NULL, NULL, NULL, &num_active_bases_out, NULL, NULL,
+                                                         NULL, NULL));
+    if (num_active_bases_in == num_active_bases_out) {
+      CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_build_good));
+      if (is_build_good) CeedCallBackend(CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(op, &is_build_good));
+    }
+    if (!is_build_good) {
+      CeedDebug(ceed, "Single Operator Assemble at Points compile failed, using fallback\n");
+      data->use_assembly_fallback = true;
+    }
+  }
+
+  // Try assembly
+  if (!data->use_assembly_fallback) {
+    bool                   is_run_good = true;
+    Ceed_Hip              *Hip_data;
+    CeedInt                num_elem, num_input_fields, num_output_fields;
+    CeedEvalMode           eval_mode;
+    CeedScalar            *assembled_array;
+    CeedQFunctionField    *qf_input_fields, *qf_output_fields;
+    CeedQFunction_Hip_gen *qf_data;
+    CeedQFunction          qf;
+    CeedOperatorField     *op_input_fields, *op_output_fields;
+
+    CeedCallBackend(CeedGetData(ceed, &Hip_data));
+    CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+    CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
+    CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+    CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+    CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+    CeedDebug(ceed, "Running single operator assemble for /gpu/hip/gen\n");
+
+    // Input vectors
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+        data->fields.inputs[i] = NULL;
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        // Get input vector
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (is_active) data->fields.inputs[i] = NULL;
+        else CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+
+      // Points per elem
+      if (num_elem != data->points.num_elem) {
+        CeedInt            *points_per_elem;
+        const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+        CeedElemRestriction rstr_points = NULL;
+
+        data->points.num_elem = num_elem;
+        CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+        CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+        for (CeedInt e = 0; e < num_elem; e++) {
+          CeedInt num_points_elem;
+
+          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+          points_per_elem[e] = num_points_elem;
+        }
+        if (data->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)data->points.num_per_elem));
+        CeedCallHip(ceed, hipMalloc((void **)&data->points.num_per_elem, num_bytes));
+        CeedCallHip(ceed, hipMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, hipMemcpyHostToDevice));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+        CeedCallBackend(CeedFree(&points_per_elem));
+      }
+    }
+
+    // Get context data
+    CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
+
+    // Assembly array
+    CeedCallBackend(CeedVectorGetArray(assembled, CEED_MEM_DEVICE, &assembled_array));
+    CeedScalar *assembled_offset_array = &assembled_array[offset];
+
+    // Assemble diagonal
+    void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields,          &data->B,
+                      &data->G,          &data->W,      &data->points,  &assembled_offset_array};
+
+    CeedInt block_sizes[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1};
+
+    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
+    block_sizes[2] = 1;
+    if (data->dim == 1) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                    &is_run_good, opargs));
+    } else if (data->dim == 2) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                    &is_run_good, opargs));
+    } else if (data->dim == 3) {
+      CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+      CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);
+
+      CeedCallBackend(CeedTryRunKernelDimShared_Hip(ceed, data->assemble_full, NULL, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem,
+                                                    &is_run_good, opargs));
+    }
+    CeedCallHip(ceed, hipDeviceSynchronize());
+
+    // Restore input arrays
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
+      } else {
+        bool       is_active;
+        CeedVector vec;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+        is_active = vec == CEED_VECTOR_ACTIVE;
+        if (!is_active) CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]));
+        CeedCallBackend(CeedVectorDestroy(&vec));
+      }
+    }
+
+    // Restore point coordinates
+    {
+      CeedVector vec;
+
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+      CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+      CeedCallBackend(CeedVectorDestroy(&vec));
+    }
+
+    // Restore context data
+    CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
+
+    // Restore assembly array
+    CeedCallBackend(CeedVectorRestoreArray(assembled, &assembled_array));
+
+    // Cleanup
+    CeedCallBackend(CeedQFunctionDestroy(&qf));
+    if (!is_run_good) {
+      CeedDebug(ceed, "Single Operator Assemble at Points run failed, using fallback\n");
+      data->use_assembly_fallback = true;
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
+
+  // Fallback, if needed
+  if (data->use_assembly_fallback) {
+    CeedOperator op_fallback;
+
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for AtPoints SingleOperatorAssemble\n");
+    CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
+    CeedCallBackend(CeedOperatorAssembleSingle(op_fallback, offset, assembled));
+    return CEED_ERROR_SUCCESS;
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -184,14 +875,30 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
 // Create operator
 //------------------------------------------------------------------------------
 int CeedOperatorCreate_Hip_gen(CeedOperator op) {
+  bool                  is_composite, is_at_points;
   Ceed                  ceed;
   CeedOperator_Hip_gen *impl;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedOperatorSetData(op, impl));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen));
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  if (is_composite) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAddComposite", CeedOperatorApplyAddComposite_Hip_gen));
+  } else {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen));
+  }
+  CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+  if (is_at_points) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip_gen));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Hip_gen));
+  }
+  if (!is_at_points) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip_gen));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip_gen));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-gen/ceed-hip-gen-qfunction.c b/backends/hip-gen/ceed-hip-gen-qfunction.c
index ed10d81ad3..872f312594 100644
--- a/backends/hip-gen/ceed-hip-gen-qfunction.c
+++ b/backends/hip-gen/ceed-hip-gen-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -27,7 +27,6 @@ static int CeedQFunctionDestroy_Hip_gen(CeedQFunction qf) {
 
   CeedCallBackend(CeedQFunctionGetData(qf, &data));
   CeedCallHip(CeedQFunctionReturnCeed(qf), hipFree(data->d_c));
-  CeedCallBackend(CeedFree(&data->qfunction_source));
   CeedCallBackend(CeedFree(&data));
   return CEED_ERROR_SUCCESS;
 }
@@ -43,15 +42,11 @@ int CeedQFunctionCreate_Hip_gen(CeedQFunction qf) {
   CeedCallBackend(CeedCalloc(1, &data));
   CeedCallBackend(CeedQFunctionSetData(qf, data));
 
-  // Read QFunction source
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
-  CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");
-  CeedCheck(data->qfunction_source, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Hip_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-gen/ceed-hip-gen.c b/backends/hip-gen/ceed-hip-gen.c
index d66ceb041a..8b3ead0db7 100644
--- a/backends/hip-gen/ceed-hip-gen.c
+++ b/backends/hip-gen/ceed-hip-gen.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -17,10 +17,9 @@
 // Backend init
 //------------------------------------------------------------------------------
 static int CeedInit_Hip_gen(const char *resource, Ceed ceed) {
-  char      *resource_root;
-  const char fallback_resource[] = "/gpu/hip/ref";
-  Ceed       ceed_shared;
-  Ceed_Hip  *data;
+  char     *resource_root;
+  Ceed      ceed_shared, ceed_ref;
+  Ceed_Hip *data;
 
   CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root));
   CeedCheck(!strcmp(resource_root, "/gpu/hip") || !strcmp(resource_root, "/gpu/hip/gen"), ceed, CEED_ERROR_BACKEND,
@@ -33,11 +32,16 @@ static int CeedInit_Hip_gen(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedInit("/gpu/hip/shared", &ceed_shared));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
+  CeedCallBackend(CeedDestroy(&ceed_shared));
 
-  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));
+  CeedCallBackend(CeedInit("/gpu/hip/ref", &ceed_ref));
+  CeedCallBackend(CeedSetOperatorFallbackCeed(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip_gen));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "CompositeOperatorCreate", CeedOperatorCreate_Hip_gen));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreate_Hip_gen));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h
index a0a8ac5511..1590f217f2 100644
--- a/backends/hip-gen/ceed-hip-gen.h
+++ b/backends/hip-gen/ceed-hip-gen.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -12,21 +12,24 @@
 #include <hip/hip_runtime.h>
 
 typedef struct {
+  bool          use_fallback, use_assembly_fallback;
   CeedInt       dim;
-  CeedInt       Q_1d;
+  CeedInt       Q, Q_1d;
   CeedInt       max_P_1d;
-  hipModule_t   module;
-  hipFunction_t op;
+  CeedInt       thread_1d;
+  hipStream_t   streams[CEED_COMPOSITE_MAX];
+  hipModule_t   module, module_assemble_full, module_assemble_diagonal, module_assemble_qfunction;
+  hipFunction_t op, assemble_full, assemble_diagonal, assemble_qfunction;
   FieldsInt_Hip indices;
   Fields_Hip    fields;
   Fields_Hip    B;
   Fields_Hip    G;
   CeedScalar   *W;
+  Points_Hip    points;
 } CeedOperator_Hip_gen;
 
 typedef struct {
   const char *qfunction_name;
-  const char *qfunction_source;
   void       *d_c;
 } CeedQFunction_Hip_gen;
 
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index 3163018669..a05bba5006 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,6 +8,7 @@
 #include <ceed.h>
 #include <ceed/backend.h>
 #include <ceed/jit-tools.h>
+#include <string.h>
 #include <hip/hip_runtime.h>
 
 #include "../hip/ceed-hip-common.h"
@@ -17,7 +18,8 @@
 //------------------------------------------------------------------------------
 // Basis apply - tensor
 //------------------------------------------------------------------------------
-int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                  CeedVector u, CeedVector v) {
   Ceed              ceed;
   CeedInt           Q_1d, dim;
   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
@@ -32,15 +34,14 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose) {
-    CeedSize length;
-
-    CeedCallBackend(CeedVectorGetLength(v, &length));
-    CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
+
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
@@ -59,6 +60,7 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod
       CeedCallBackend(CeedRunKernel_Hip(ceed, data->Grad, num_elem, block_size, grad_args));
     } break;
     case CEED_EVAL_WEIGHT: {
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
       void     *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
       const int block_size_x  = Q_1d;
       const int block_size_y  = dim >= 2 ? Q_1d : 1;
@@ -78,14 +80,179 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAdd_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                 CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Basis apply - tensor AtPoints
+//------------------------------------------------------------------------------
+static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
+                                          CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  Ceed              ceed;
+  CeedInt           Q_1d, dim, max_num_points = num_points[0];
+  const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
+  const int         max_block_size = 32;
+  const CeedScalar *d_x, *d_u;
+  CeedScalar       *d_v;
+  CeedBasis_Hip    *data;
+
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+
+  // Weight handled separately
+  if (eval_mode == CEED_EVAL_WEIGHT) {
+    CeedCallBackend(CeedVectorSetValue(v, 1.0));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
+      CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
+    }
+  }
+
+  // Build kernels if needed
+  if (data->num_points != max_num_points) {
+    CeedInt P_1d;
+
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    data->num_points = max_num_points;
+
+    // -- Create interp matrix to Chebyshev coefficients
+    if (!data->d_chebyshev_interp_1d) {
+      CeedSize    interp_bytes;
+      CeedScalar *chebyshev_interp_1d;
+
+      interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+      CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+      CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
+      CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
+      CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+    }
+
+    // -- Compile kernels
+    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h>\n";
+    CeedInt    num_comp;
+
+    if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
+                                    Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                    "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
+                                    max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
+  }
+
+  // Get read/write access to u, v
+  CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
+
+  // Basis action
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+
+      CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size,
+                                        interp_args));
+    } break;
+    case CEED_EVAL_GRAD: {
+      void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+
+      CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
+    } break;
+    case CEED_EVAL_WEIGHT:
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                      CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                         CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Basis apply - non-tensor
 //------------------------------------------------------------------------------
-int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
-                                CeedVector v) {
+static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                           CeedVector u, CeedVector v) {
   Ceed                    ceed;
   CeedInt                 num_nodes, num_qpts;
   const CeedInt           is_transpose    = t_mode == CEED_TRANSPOSE;
@@ -103,14 +270,12 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
-
-  // Clear v for transpose operation
-  if (is_transpose) {
-    CeedSize length;
-
-    CeedCallBackend(CeedVectorGetLength(v, &length));
-    CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    // Clear v for transpose operation
+    if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
   }
 
   // Apply basis operation
@@ -156,6 +321,7 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra
       }
     } break;
     case CEED_EVAL_WEIGHT: {
+      CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v};
 
       CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args));
@@ -168,6 +334,19 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                       CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                          CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -181,10 +360,15 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallHip(ceed, hipModuleUnload(data->module));
-  CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
+  if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
+  if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
   CeedCallHip(ceed, hipFree(data->d_interp_1d));
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
+  CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -198,12 +382,13 @@ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallHip(ceed, hipModuleUnload(data->module));
-  CeedCallHip(ceed, hipFree(data->d_q_weight));
+  if (data->d_q_weight) CeedCallHip(ceed, hipFree(data->d_q_weight));
   CeedCallHip(ceed, hipFree(data->d_interp));
   CeedCallHip(ceed, hipFree(data->d_grad));
   CeedCallHip(ceed, hipFree(data->d_div));
   CeedCallHip(ceed, hipFree(data->d_curl));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -213,8 +398,6 @@ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) {
 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
   Ceed           ceed;
-  char          *basis_kernel_source;
-  const char    *basis_kernel_path;
   CeedInt        num_comp;
   const CeedInt  q_bytes      = Q_1d * sizeof(CeedScalar);
   const CeedInt  interp_bytes = q_bytes * P_1d;
@@ -224,33 +407,35 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
   CeedCallBackend(CeedCalloc(1, &data));
 
   // Copy data to GPU
-  CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
-  CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
+  if (q_weight_1d) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
+  }
   CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
   CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice));
   CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes));
   CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice));
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
-                                  num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
+                                  Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
                                   "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -260,8 +445,6 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                           const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                    ceed;
-  char                   *basis_kernel_source;
-  const char             *basis_kernel_path;
   CeedInt                 num_comp, q_comp_interp, q_comp_grad;
   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Hip *data;
@@ -272,8 +455,10 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
-  CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
@@ -288,11 +473,9 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
@@ -300,14 +483,14 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -317,8 +500,6 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
 int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
                             const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                    ceed;
-  char                   *basis_kernel_source;
-  const char             *basis_kernel_path;
   CeedInt                 num_comp, q_comp_interp, q_comp_div;
   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Hip *data;
@@ -329,8 +510,10 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
-  CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
@@ -345,11 +528,9 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
@@ -357,14 +538,14 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -374,8 +555,6 @@ int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_node
 int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
                              const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
   Ceed                    ceed;
-  char                   *basis_kernel_source;
-  const char             *basis_kernel_path;
   CeedInt                 num_comp, q_comp_interp, q_comp_curl;
   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
   CeedBasisNonTensor_Hip *data;
@@ -386,8 +565,10 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   // Copy basis data to GPU
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
-  CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
-  CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  if (q_weight) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
+  }
   if (interp) {
     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
 
@@ -402,11 +583,9 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   }
 
   // Compile basis kernels
+  const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n";
+
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
@@ -414,14 +593,14 @@ int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 486d9bc400..4ef3c76bdb 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -26,20 +26,28 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Apply data
-  for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
-    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
-  }
-  CeedCallBackend(CeedFree(&impl->e_vecs));
+  CeedCallBackend(CeedFree(&impl->num_points));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_in));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_out));
+  CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
+  CeedCallBackend(CeedFree(&impl->input_field_order));
+  CeedCallBackend(CeedFree(&impl->output_field_order));
+  CeedCallBackend(CeedFree(&impl->input_states));
 
   for (CeedInt i = 0; i < impl->num_inputs; i++) {
+    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i]));
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i]));
   }
+  CeedCallBackend(CeedFree(&impl->e_vecs_in));
   CeedCallBackend(CeedFree(&impl->q_vecs_in));
 
   for (CeedInt i = 0; i < impl->num_outputs; i++) {
+    CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i]));
     CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
   }
+  CeedCallBackend(CeedFree(&impl->e_vecs_out));
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
+  CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem));
 
   // QFunction assembly data
   for (CeedInt i = 0; i < impl->num_active_in; i++) {
@@ -69,10 +77,11 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
     CeedCallHip(ceed, hipFree(impl->diag->d_div_out));
     CeedCallHip(ceed, hipFree(impl->diag->d_curl_in));
     CeedCallHip(ceed, hipFree(impl->diag->d_curl_out));
-    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
-    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
+    CeedCallBackend(CeedDestroy(&ceed));
     CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag));
     CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
   }
   CeedCallBackend(CeedFree(&impl->diag));
 
@@ -83,6 +92,7 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
     CeedCallHip(ceed, hipModuleUnload(impl->asmb->module));
     CeedCallHip(ceed, hipFree(impl->asmb->d_B_in));
     CeedCallHip(ceed, hipFree(impl->asmb->d_B_out));
+    CeedCallBackend(CeedDestroy(&ceed));
   }
   CeedCallBackend(CeedFree(&impl->asmb));
 
@@ -93,8 +103,8 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
 //------------------------------------------------------------------------------
 // Setup infields or outfields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e,
-                                       CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
+static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, bool *apply_add_basis,
+                                       CeedVector *e_vecs, CeedVector *q_vecs, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
   Ceed                ceed;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
@@ -110,68 +120,115 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
 
   // Loop over fields
   for (CeedInt i = 0; i < num_fields; i++) {
-    bool         is_strided = false, skip_restriction = false;
-    CeedSize     q_size;
-    CeedInt      size;
-    CeedEvalMode eval_mode;
-    CeedBasis    basis;
+    bool                is_active = false, is_strided = false, skip_e_vec = false;
+    CeedSize            q_size;
+    CeedInt             size;
+    CeedEvalMode        eval_mode;
+    CeedVector          l_vec;
+    CeedElemRestriction elem_rstr;
 
+    // Check whether this field can skip the element restriction:
+    // Input CEED_VECTOR_ACTIVE
+    // Output CEED_VECTOR_ACTIVE without CEED_EVAL_NONE
+    // Input CEED_VECTOR_NONE with CEED_EVAL_WEIGHT
+    // Input passive vector with CEED_EVAL_NONE and strided restriction with CEED_STRIDES_BACKEND
+    CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
-    if (eval_mode != CEED_EVAL_WEIGHT) {
-      CeedElemRestriction elem_rstr;
-
-      // Check whether this field can skip the element restriction:
-      // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
-
-      // First, check whether the field is input or output:
-      if (is_input) {
-        CeedVector vec;
-
-        // Check for passive input
-        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
-        if (vec != CEED_VECTOR_ACTIVE) {
-          // Check eval_mode
-          if (eval_mode == CEED_EVAL_NONE) {
-            // Check for strided restriction
-            CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
-            if (is_strided) {
-              // Check if vector is already in preferred backend ordering
-              CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction));
-            }
-          }
-        }
-      }
-      if (skip_restriction) {
-        // We do not need an E-Vector, but will use the input field vector's data directly in the operator application.
-        e_vecs[i + start_e] = NULL;
-      } else {
-        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e]));
-      }
+    skip_e_vec = (is_input && is_active) || (is_active && eval_mode != CEED_EVAL_NONE) || (eval_mode == CEED_EVAL_WEIGHT);
+    if (!skip_e_vec && is_input && !is_active && eval_mode == CEED_EVAL_NONE) {
+      CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
+      if (is_strided) CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_e_vec));
     }
+    if (skip_e_vec) {
+      e_vecs[i] = NULL;
+    } else {
+      CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i]));
+    }
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
-        q_size = (CeedSize)num_elem * Q * size;
-        CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
-        break;
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
-        q_size = (CeedSize)num_elem * Q * size;
+        q_size = (CeedSize)num_elem * (CeedSize)Q * (CeedSize)size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         break;
-      case CEED_EVAL_WEIGHT:  // Only on input fields
+      case CEED_EVAL_WEIGHT: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
-        q_size = (CeedSize)num_elem * Q;
+        q_size = (CeedSize)num_elem * (CeedSize)Q;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        if (is_at_points) {
+          CeedInt num_points[num_elem];
+
+          for (CeedInt i = 0; i < num_elem; i++) num_points[i] = Q;
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE,
+                                                 q_vecs[i]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
     }
   }
+  // Drop duplicate restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          skip_rstr[j] = true;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          if (e_vecs[i]) CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          skip_rstr[j]       = true;
+          apply_add_basis[i] = true;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -179,7 +236,6 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
 // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
 //------------------------------------------------------------------------------
 static int CeedOperatorSetup_Hip(CeedOperator op) {
-  Ceed                ceed;
   bool                is_setup_done;
   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
@@ -190,7 +246,6 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
   if (is_setup_done) return CEED_ERROR_SUCCESS;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -199,159 +254,623 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Allocate
-  CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
-  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out));
   impl->num_inputs  = num_input_fields;
   impl->num_outputs = num_output_fields;
 
-  // Set up infield and outfield e_vecs and q_vecs
-  // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
-  // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
+  // Set up infield and outfield e-vecs and q-vecs
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields, Q,
+                                              num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out,
+                                              impl->q_vecs_out, num_output_fields, Q, num_elem));
 
+  // Reorder fields to allow reuse of buffers
+  impl->max_active_e_vec_len = 0;
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedSize            e_vec_len_i;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                       = true;
+      impl->input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) {
+        // CEED_EVAL_WEIGHT
+        CeedCallBackend(CeedVectorDestroy(&vec_i));
+        continue;
+      };
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                       = true;
+          impl->input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedSize            e_vec_len_i;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                        = true;
+      impl->output_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_output_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                        = true;
+          impl->output_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
+  {
+    // Create two work vectors for diagonal assembly
+    CeedVector temp_1, temp_2;
+
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1));
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2));
+  }
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
-// Setup Operator Inputs
+// Restrict Operator Inputs
 //------------------------------------------------------------------------------
-static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                              CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
-                                              CeedOperator_Hip *impl, CeedRequest *request) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
+static inline int CeedOperatorInputRestrict_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                                CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Hip *impl,
+                                                CeedRequest *request) {
+  bool       is_active = false;
+  CeedVector l_vec, e_vec = impl->e_vecs_in[input_field];
+
+  // Get input vector
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
+  }
 
-    // Get input vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) {
-      if (skip_active) continue;
-      else vec = in_vec;
-    }
+  // Restriction action
+  if (e_vec) {
+    // Restrict, if necessary
+    if (!impl->skip_rstr_in[input_field]) {
+      uint64_t state;
 
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
-    } else {
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      // Get input element restriction
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-      if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
-      // Restrict, if necessary
-      if (!impl->e_vecs[i]) {
-        // No restriction for this field; read data directly from vec.
-        CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
-      } else {
-        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
-        // Get evec
-        CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
+      CeedCallBackend(CeedVectorGetState(l_vec, &state));
+      if (is_active || state != impl->input_states[input_field]) {
+        CeedElemRestriction elem_rstr;
+
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_field, &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, l_vec, e_vec, request));
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       }
+      impl->input_states[input_field] = state;
     }
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Input Basis Action
 //------------------------------------------------------------------------------
-static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                             CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
+static inline int CeedOperatorInputBasis_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                             CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const bool skip_active,
                                              CeedOperator_Hip *impl) {
+  bool         is_active = false;
+  CeedEvalMode eval_mode;
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
+
+  // Skip active input
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
+  }
+
+  // Basis action
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  switch (eval_mode) {
+    case CEED_EVAL_NONE: {
+      const CeedScalar *e_vec_array;
+
+      if (e_vec) {
+        CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array));
+      } else {
+        CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array));
+      }
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
+      break;
+    }
+    case CEED_EVAL_INTERP:
+    case CEED_EVAL_GRAD:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL: {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
+      CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, e_vec, q_vec));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      break;
+    }
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
+  }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Restore Input Vectors
+//------------------------------------------------------------------------------
+static inline int CeedOperatorInputRestore_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                               CeedVector in_vec, CeedVector active_e_vec, const bool skip_active, CeedOperator_Hip *impl) {
+  bool         is_active = false;
+  CeedEvalMode eval_mode;
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field];
+
+  // Skip active input
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (is_active && skip_active) return CEED_ERROR_SUCCESS;
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
+  }
+
+  // Restore e-vec
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  if (eval_mode == CEED_EVAL_NONE) {
+    const CeedScalar *e_vec_array;
+
+    CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_in[input_field], CEED_MEM_DEVICE, (CeedScalar **)&e_vec_array));
+    if (e_vec) {
+      CeedCallBackend(CeedVectorRestoreArrayRead(e_vec, &e_vec_array));
+    } else {
+      CeedCallBackend(CeedVectorRestoreArrayRead(l_vec, &e_vec_array));
+    }
+  }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Apply and add to output
+//------------------------------------------------------------------------------
+static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
+  CeedInt             Q, num_elem, num_input_fields, num_output_fields;
+  Ceed                ceed;
+  CeedVector          active_e_vec;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Hip   *impl;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetup_Hip(op));
+
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec));
+
+  // Process inputs
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             elem_size, size;
-    CeedEvalMode        eval_mode;
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+    CeedInt field = impl->input_field_order[i];
 
-    // Skip active input
-    if (skip_active) {
-      CeedVector vec;
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem, false, impl));
+  }
+
+  // Output pointers, as necessary
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedScalar *e_vec_array;
 
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
     }
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
+  }
+
+  // Q function
+  CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
+
+  // Restore input arrays
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl));
+  }
+
+  // Output basis and restriction
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool         is_active = false;
+    CeedInt      field     = impl->output_field_order[i];
+    CeedEvalMode eval_mode;
+    CeedVector   l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
+
+    // Output vector
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
+      l_vec = out_vec;
+      if (!e_vec) e_vec = active_e_vec;
+    }
+
     // Basis action
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
-        break;
+        break;  // No action
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i]));
+      case CEED_EVAL_CURL: {
+        CeedBasis basis;
+
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
+        if (impl->apply_add_basis_out[field]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, q_vec, e_vec));
+        }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
-      case CEED_EVAL_WEIGHT:
-        break;  // No action
+      }
+      // LCOV_EXCL_START
+      case CEED_EVAL_WEIGHT: {
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+        // LCOV_EXCL_STOP
+      }
+    }
+
+    // Restore evec
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
+    }
+
+    // Restrict
+    if (!impl->skip_rstr_out[field]) {
+      CeedElemRestriction elem_rstr;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
+
+  // Return work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
-// Restore Input Vectors
+// CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction.
 //------------------------------------------------------------------------------
-static inline int CeedOperatorRestoreInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
-                                                const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) {
-  for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode eval_mode;
-    CeedVector   vec;
+static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
+  bool                is_setup_done;
+  CeedInt             max_num_points = -1, num_elem, num_input_fields, num_output_fields;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Hip   *impl;
+
+  CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
+  if (is_setup_done) return CEED_ERROR_SUCCESS;
 
-    // Skip active input
-    if (skip_active) {
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  {
+    CeedElemRestriction rstr_points = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedCalloc(num_elem, &impl->num_points));
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt num_points_elem;
+
+      CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+      impl->num_points[e] = num_points_elem;
     }
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
-    } else {
-      if (!impl->e_vecs[i]) {  // This was a skip_restriction case
-        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-        CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i]));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+  impl->max_num_points = max_num_points;
+
+  // Allocate
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->e_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->e_vecs_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->apply_add_basis_out));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_field_order));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->output_field_order));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->input_states));
+  CeedCallBackend(CeedCalloc(num_input_fields, &impl->q_vecs_in));
+  CeedCallBackend(CeedCalloc(num_output_fields, &impl->q_vecs_out));
+  impl->num_inputs  = num_input_fields;
+  impl->num_outputs = num_output_fields;
+
+  // Set up infield and outfield e-vecs and q-vecs
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs_in, impl->q_vecs_in, num_input_fields,
+                                              max_num_points, num_elem));
+  CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_out, impl->q_vecs_out,
+                                              num_output_fields, max_num_points, num_elem));
+
+  // Reorder fields to allow reuse of buffers
+  impl->max_active_e_vec_len = 0;
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      CeedSize            e_vec_len_i;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                       = true;
+      impl->input_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
+      if (vec_i == CEED_VECTOR_NONE) {
+        // CEED_EVAL_WEIGHT
+        CeedCallBackend(CeedVectorDestroy(&vec_i));
+        continue;
+      };
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_input_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                       = true;
+          impl->input_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  {
+    bool    is_ordered[CEED_FIELD_MAX];
+    CeedInt curr_index = 0;
+
+    for (CeedInt i = 0; i < num_output_fields; i++) is_ordered[i] = false;
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedSize            e_vec_len_i;
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      if (is_ordered[i]) continue;
+      is_ordered[i]                        = true;
+      impl->output_field_order[curr_index] = i;
+      curr_index++;
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
+      CeedCallBackend(CeedElemRestrictionGetEVectorSize(rstr_i, &e_vec_len_i));
+      impl->max_active_e_vec_len = e_vec_len_i > impl->max_active_e_vec_len ? e_vec_len_i : impl->max_active_e_vec_len;
+      for (CeedInt j = i + 1; j < num_output_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
+        if (rstr_i == rstr_j && vec_i == vec_j) {
+          is_ordered[j]                        = true;
+          impl->output_field_order[curr_index] = j;
+          curr_index++;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  CeedCallBackend(CeedClearWorkVectors(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len));
+  {
+    // Create two work vectors for diagonal assembly
+    CeedVector temp_1, temp_2;
+
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_1));
+    CeedCallBackend(CeedGetWorkVector(CeedOperatorReturnCeed(op), impl->max_active_e_vec_len, &temp_2));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_1));
+    CeedCallBackend(CeedRestoreWorkVector(CeedOperatorReturnCeed(op), &temp_2));
+  }
+  CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Input Basis Action AtPoints
+//------------------------------------------------------------------------------
+static inline int CeedOperatorInputBasisAtPoints_Hip(CeedOperatorField op_input_field, CeedQFunctionField qf_input_field, CeedInt input_field,
+                                                     CeedVector in_vec, CeedVector active_e_vec, CeedInt num_elem, const CeedInt *num_points,
+                                                     const bool skip_active, const bool skip_passive, CeedOperator_Hip *impl) {
+  bool         is_active = false;
+  CeedEvalMode eval_mode;
+  CeedVector   l_vec, e_vec = impl->e_vecs_in[input_field], q_vec = impl->q_vecs_in[input_field];
+
+  // Skip active input
+  CeedCallBackend(CeedOperatorFieldGetVector(op_input_field, &l_vec));
+  is_active = l_vec == CEED_VECTOR_ACTIVE;
+  if (skip_active && is_active) return CEED_ERROR_SUCCESS;
+  if (skip_passive && !is_active) {
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    return CEED_ERROR_SUCCESS;
+  }
+  if (is_active) {
+    l_vec = in_vec;
+    if (!e_vec) e_vec = active_e_vec;
+  }
+
+  // Basis action
+  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_field, &eval_mode));
+  switch (eval_mode) {
+    case CEED_EVAL_NONE: {
+      const CeedScalar *e_vec_array;
+
+      if (e_vec) {
+        CeedCallBackend(CeedVectorGetArrayRead(e_vec, CEED_MEM_DEVICE, &e_vec_array));
       } else {
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i]));
+        CeedCallBackend(CeedVectorGetArrayRead(l_vec, CEED_MEM_DEVICE, &e_vec_array));
       }
+      CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, (CeedScalar *)e_vec_array));
+      break;
+    }
+    case CEED_EVAL_INTERP:
+    case CEED_EVAL_GRAD:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL: {
+      CeedBasis basis;
+
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_field, &basis));
+      CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_NOTRANSPOSE, eval_mode, impl->point_coords_elem, e_vec, q_vec));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      break;
     }
+    case CEED_EVAL_WEIGHT:
+      break;  // No action
   }
+  if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
-// Apply and add to output
+// Apply and add to output AtPoints
 //------------------------------------------------------------------------------
-static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
-  CeedInt             Q, num_elem, elem_size, num_input_fields, num_output_fields, size;
-  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
+static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
+  CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
+  Ceed                ceed;
+  CeedVector          active_e_vec;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Hip   *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Setup
-  CeedCallBackend(CeedOperatorSetup_Hip(op));
+  CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op));
+  num_points     = impl->num_points;
+  max_num_points = impl->max_num_points;
 
-  // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec));
 
-  // Input basis apply if needed
-  CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl));
+  // Get point coordinates
+  {
+    CeedVector          point_coords = NULL;
+    CeedElemRestriction rstr_points  = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+    if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    {
+      uint64_t state;
+      CeedCallBackend(CeedVectorGetState(point_coords, &state));
+      if (impl->points_state != state) {
+        CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+      }
+    }
+    CeedCallBackend(CeedVectorDestroy(&point_coords));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+
+  // Process inputs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedInt field = impl->input_field_order[i];
+
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, false, impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[field], qf_input_fields[field], field, in_vec, active_e_vec, num_elem,
+                                                       num_points, false, false, impl));
+  }
 
   // Output pointers, as necessary
   for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -359,68 +878,86 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      // Set the output Q-Vector to use the E-Vector data directly.
-      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields]));
-      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
     }
   }
 
   // Q function
-  CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));
+  CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
 
-  // Output basis apply if needed
+  // Restore input arrays
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, in_vec, active_e_vec, false, impl));
+  }
+
+  // Output basis and restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+    bool         is_active = false;
+    CeedInt      field     = impl->output_field_order[i];
+    CeedEvalMode eval_mode;
+    CeedVector   l_vec, e_vec = impl->e_vecs_out[field], q_vec = impl->q_vecs_out[field];
+
+    // Output vector
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
+      l_vec = out_vec;
+      if (!e_vec) e_vec = active_e_vec;
+    }
 
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
     // Basis action
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field], &eval_mode));
     switch (eval_mode) {
       case CEED_EVAL_NONE:
         break;  // No action
       case CEED_EVAL_INTERP:
       case CEED_EVAL_GRAD:
       case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs]));
+      case CEED_EVAL_CURL: {
+        CeedBasis basis;
+
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field], &basis));
+        if (impl->apply_add_basis_out[field]) {
+          CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+        } else {
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+        }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
-        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
         // LCOV_EXCL_STOP
       }
     }
-  }
-
-  // Output restriction
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
 
     // Restore evec
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
-      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields]));
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[field], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
     }
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+
     // Restrict
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-    // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+    if (!impl->skip_rstr_out[field]) {
+      CeedElemRestriction elem_rstr;
 
-    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, l_vec, request));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
 
-  // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+  // Restore work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -431,7 +968,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
                                                               CeedRequest *request) {
   Ceed                ceed, ceed_parent;
   CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
-  CeedScalar         *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL};
+  CeedScalar         *assembled_array;
   CeedVector         *active_inputs;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -452,19 +989,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
   // Setup
   CeedCallBackend(CeedOperatorSetup_Hip(op));
 
-  // Input Evecs and Restriction
-  CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
+  // Process inputs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, true, impl));
+  }
 
   // Count number of active input fields
   if (!num_active_in) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
       CeedScalar *q_vec_array;
-      CeedVector  vec;
+      CeedVector  l_vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
-      if (vec == CEED_VECTOR_ACTIVE) {
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &l_vec));
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
         CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array));
@@ -473,12 +1012,13 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
           CeedSize q_size = (CeedSize)Q * num_elem;
 
           CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field]));
-          CeedCallBackend(
-              CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem]));
+          CeedCallBackend(CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER,
+                                             &q_vec_array[field * Q * num_elem]));
         }
         num_active_in += size;
         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     impl->num_active_in = num_active_in;
     impl->qf_active_in  = active_inputs;
@@ -487,15 +1027,15 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
   // Count number of active output fields
   if (!num_active_out) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      CeedVector vec;
+      CeedVector l_vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
-      if (vec == CEED_VECTOR_ACTIVE) {
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &l_vec));
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
         num_active_out += size;
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     impl->num_active_out = num_active_out;
   }
@@ -510,16 +1050,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
 
     // Create output restriction
     CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
-                                                     num_active_in * num_active_out * num_elem * Q, strides, rstr));
+                                                     (CeedSize)num_active_in * (CeedSize)num_active_out * (CeedSize)num_elem * (CeedSize)Q, strides,
+                                                     rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
   }
   CeedCallBackend(CeedVectorSetValue(*assembled, 0.0));
   CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array));
 
-  // Input basis apply
-  CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl));
-
   // Assemble QFunction
   for (CeedInt in = 0; in < num_active_in; in++) {
     // Set Inputs
@@ -529,38 +1067,42 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
     }
     // Set Outputs
     for (CeedInt out = 0; out < num_output_fields; out++) {
-      CeedVector vec;
+      CeedVector l_vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
       // Check if active output
-      if (vec == CEED_VECTOR_ACTIVE) {
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
+      if (l_vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
       }
+      CeedCallBackend(CeedVectorDestroy(&l_vec));
     }
     // Apply QFunction
     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
   }
 
-  // Un-set output q_vecs to prevent accidental overwrite of Assembled
+  // Un-set output q-vecs to prevent accidental overwrite of Assembled
   for (CeedInt out = 0; out < num_output_fields; out++) {
-    CeedVector vec;
+    CeedVector l_vec;
 
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-    // Check if active output
-    if (vec == CEED_VECTOR_ACTIVE) {
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &l_vec));
+    if (l_vec == CEED_VECTOR_ACTIVE) {
       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
     }
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
   }
 
   // Restore input arrays
-  CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl));
+  }
 
   // Restore output
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -604,13 +1146,14 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator diagonal assembly with multiple active bases");
-      basis_in = basis;
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -620,6 +1163,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis
@@ -636,7 +1180,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
       CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator diagonal assembly with multiple active bases");
-      basis_out = basis;
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -646,6 +1191,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Operator data struct
@@ -757,6 +1303,10 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
   CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, hipMemcpyHostToDevice));
   CeedCallBackend(CeedFree(&eval_modes_in));
   CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -765,8 +1315,6 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) {
 //------------------------------------------------------------------------------
 static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) {
   Ceed                ceed;
-  char               *diagonal_kernel_source;
-  const char         *diagonal_kernel_path;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
   CeedInt             num_comp, q_comp, num_nodes, num_qpts;
   CeedBasis           basis_in = NULL, basis_out = NULL;
@@ -788,14 +1336,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op,
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in));
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis
@@ -807,14 +1359,18 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op,
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
       CeedEvalMode eval_mode;
+      CeedBasis    basis;
 
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out));
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp));
       if (eval_mode != CEED_EVAL_WEIGHT) {
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Operator data struct
@@ -822,22 +1378,22 @@ static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op,
   CeedOperatorDiag_Hip *diag = impl->diag;
 
   // Assemble kernel
-  hipModule_t *module          = is_point_block ? &diag->module_point_block : &diag->module;
-  CeedInt      elems_per_block = 1;
+  const char   diagonal_kernel_source[] = "// Diagonal assembly source\n#include <ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h>\n";
+  hipModule_t *module                   = is_point_block ? &diag->module_point_block : &diag->module;
+  CeedInt      elems_per_block          = 1;
+
   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
   CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
   if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
   else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n");
   CeedCallHip(ceed, CeedCompile_Hip(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
                                     num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE",
                                     use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block));
   CeedCallHip(ceed, CeedGetKernel_Hip(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
-  CeedCallBackend(CeedFree(&diagonal_kernel_path));
-  CeedCallBackend(CeedFree(&diagonal_kernel_source));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -889,6 +1445,8 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect
     CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr));
     CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag));
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
   diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr;
   elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag;
   CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0));
@@ -897,8 +1455,8 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect
   CeedCallBackend(CeedElemRestrictionGetElementSize(diag_rstr, &num_nodes));
   if (num_nodes > 0) {
     // Assemble element operator diagonals
-    CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array));
     CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem));
+    CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array));
 
     // Compute the diagonal of B^T D B
     CeedInt elems_per_block = 1;
@@ -922,6 +1480,7 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect
   CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request));
 
   // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorDestroy(&assembled_qf));
   return CEED_ERROR_SUCCESS;
 }
@@ -945,10 +1504,9 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op,
 //------------------------------------------------------------------------------
 // Single Operator Assembly Setup
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) {
+static int CeedOperatorAssembleSingleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) {
   Ceed                ceed;
-  char               *assembly_kernel_source;
-  const char         *assembly_kernel_path;
+  Ceed_Hip           *hip_data;
   CeedInt             num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0;
   CeedInt             elem_size_in, num_qpts_in = 0, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp;
   CeedEvalMode       *eval_modes_in = NULL, *eval_modes_out = NULL;
@@ -973,13 +1531,17 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
 
     CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
-      CeedEvalMode eval_mode;
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
       CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
-      basis_in = basis;
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in));
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr));
+      if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in));
       if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in;
       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in));
@@ -994,6 +1556,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
         num_eval_modes_in += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
@@ -1003,14 +1566,18 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
 
     CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedBasis    basis;
-      CeedEvalMode eval_mode;
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
 
       CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis));
       CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND,
                 "Backend does not implement operator assembly with multiple active bases");
-      basis_out = basis;
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out));
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &elem_rstr));
+      if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
       if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out;
       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out));
@@ -1027,6 +1594,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
         num_eval_modes_out += q_comp;
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
   CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
 
@@ -1036,7 +1604,8 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
   asmb->block_size_x             = elem_size_in;
   asmb->block_size_y             = elem_size_out;
 
-  bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > 1024;
+  CeedCallBackend(CeedGetData(ceed, &hip_data));
+  bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > hip_data->device_prop.maxThreadsPerBlock;
 
   if (fallback) {
     // Use fallback kernel with 1D threadblock
@@ -1044,20 +1613,16 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
   }
 
   // Compile kernels
+  const char assembly_kernel_source[] = "// Full assembly source\n#include <ceed/jit-source/hip/hip-ref-operator-assemble.h>\n";
+
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out));
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble.h", &assembly_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n");
   CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
                                   num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in,
                                   "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE",
                                   asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "USE_CEEDSIZE",
                                   use_ceedsize_idx));
   CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble));
-  CeedCallBackend(CeedFree(&assembly_kernel_path));
-  CeedCallBackend(CeedFree(&assembly_kernel_source));
 
   // Load into B_in, in order that they will be used in eval_modes_in
   {
@@ -1090,11 +1655,9 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
       CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar),
                                   hipMemcpyHostToDevice));
     }
-
-    if (identity) {
-      CeedCallBackend(CeedFree(&identity));
-    }
+    CeedCallBackend(CeedFree(&identity));
   }
+  CeedCallBackend(CeedFree(&eval_modes_in));
 
   // Load into B_out, in order that they will be used in eval_modes_out
   {
@@ -1127,11 +1690,15 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
       CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar),
                                   hipMemcpyHostToDevice));
     }
-
-    if (identity) {
-      CeedCallBackend(CeedFree(&identity));
-    }
+    CeedCallBackend(CeedFree(&identity));
   }
+  CeedCallBackend(CeedFree(&eval_modes_out));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1139,11 +1706,11 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed
 // Assemble matrix data for COO matrix of assembled operator.
 // The sparsity pattern is set by CeedOperatorLinearAssembleSymbolic.
 //
-// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator (could have multiple basis eval
-// modes).
+// Note that this (and other assembly routines) currently assume only one active input restriction/basis per operator
+// (could have multiple basis eval modes).
 // TODO: allow multiple active input restrictions/basis objects
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedVector values) {
+static int CeedOperatorAssembleSingle_Hip(CeedOperator op, CeedInt offset, CeedVector values) {
   Ceed                ceed;
   CeedSize            values_length = 0, assembled_qf_length = 0;
   CeedInt             use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out;
@@ -1169,7 +1736,7 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV
   if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
 
   // Setup
-  if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Hip(op, use_ceedsize_idx));
+  if (!impl->asmb) CeedCallBackend(CeedOperatorAssembleSingleSetup_Hip(op, use_ceedsize_idx));
   CeedOperatorAssemble_Hip *asmb = impl->asmb;
 
   assert(asmb != NULL);
@@ -1215,8 +1782,8 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV
   void   *args[] = {(void *)&num_elem_in, &asmb->d_B_in,     &asmb->d_B_out,      &orients_in,  &curl_orients_in,
                     &orients_out,         &curl_orients_out, &assembled_qf_array, &values_array};
 
-  CeedCallBackend(
-      CeedRunKernelDimShared_Hip(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args));
+  CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, asmb->LinearAssemble, NULL, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block,
+                                             shared_mem, args));
 
   // Restore arrays
   CeedCallBackend(CeedVectorRestoreArray(values, &values_array));
@@ -1236,6 +1803,272 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV
       CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out));
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Assemble Linear QFunction AtPoints
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleQFunctionAtPoints_Hip(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
+  return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Backend does not implement CeedOperatorLinearAssembleQFunction");
+}
+
+//------------------------------------------------------------------------------
+// Assemble Linear Diagonal AtPoints
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
+  CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
+  Ceed                ceed;
+  CeedVector          active_e_vec_in, active_e_vec_out;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Hip   *impl;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op));
+  num_points     = impl->num_points;
+  max_num_points = impl->max_num_points;
+
+  // Work vector
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_in));
+  CeedCallBackend(CeedGetWorkVector(ceed, impl->max_active_e_vec_len, &active_e_vec_out));
+  {
+    CeedSize length_in, length_out;
+
+    CeedCallBackend(CeedVectorGetLength(active_e_vec_in, &length_in));
+    CeedCallBackend(CeedVectorGetLength(active_e_vec_out, &length_out));
+    // Need input e_vec to be longer
+    if (length_in < length_out) {
+      CeedVector temp = active_e_vec_in;
+
+      active_e_vec_in  = active_e_vec_out;
+      active_e_vec_out = temp;
+    }
+  }
+
+  // Get point coordinates
+  {
+    CeedVector          point_coords = NULL;
+    CeedElemRestriction rstr_points  = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+    if (!impl->point_coords_elem) CeedCallBackend(CeedElemRestrictionCreateVector(rstr_points, NULL, &impl->point_coords_elem));
+    {
+      uint64_t state;
+      CeedCallBackend(CeedVectorGetState(point_coords, &state));
+      if (impl->points_state != state) {
+        CeedCallBackend(CeedElemRestrictionApply(rstr_points, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+      }
+    }
+    CeedCallBackend(CeedVectorDestroy(&point_coords));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+
+  // Process inputs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestrict_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl, request));
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, num_elem, num_points, true, false,
+                                                       impl));
+  }
+
+  // Output pointers, as necessary
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
+    }
+  }
+
+  // Loop over active fields
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool                is_active = false, is_active_at_points = true;
+    CeedInt             elem_size = 1, num_comp_active = 1, e_vec_size = 0, field_in = impl->input_field_order[i];
+    CeedRestrictionType rstr_type;
+    CeedVector          l_vec;
+    CeedElemRestriction elem_rstr;
+
+    // -- Skip non-active input
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[field_in], &l_vec));
+    is_active = l_vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&l_vec));
+    if (!is_active || impl->skip_rstr_in[field_in]) continue;
+
+    // -- Get active restriction type
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[field_in], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+    is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
+    if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    else elem_size = max_num_points;
+    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+
+    e_vec_size = elem_size * num_comp_active;
+    CeedCallBackend(CeedVectorSetValue(active_e_vec_in, 0.0));
+    for (CeedInt s = 0; s < e_vec_size; s++) {
+      CeedVector q_vec = impl->q_vecs_in[field_in];
+
+      // Update unit vector
+      {
+        // Note: E-vec strides are node * (1) + comp * (elem_size * num_elem) + elem * (elem_size)
+        CeedInt  node = (s - 1) % elem_size, comp = (s - 1) / elem_size;
+        CeedSize start = node * 1 + comp * (elem_size * num_elem);
+        CeedSize stop  = (comp + 1) * (elem_size * num_elem);
+
+        if (s != 0) CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 0.0));
+
+        node = s % elem_size, comp = s / elem_size;
+        start = node * 1 + comp * (elem_size * num_elem);
+        stop  = (comp + 1) * (elem_size * num_elem);
+        CeedCallBackend(CeedVectorSetValueStrided(active_e_vec_in, start, stop, elem_size, 1.0));
+      }
+
+      // Basis action
+      for (CeedInt j = 0; j < num_input_fields; j++) {
+        CeedInt field = impl->input_field_order[j];
+
+        CeedCallBackend(CeedOperatorInputBasisAtPoints_Hip(op_input_fields[field], qf_input_fields[field], field, NULL, active_e_vec_in, num_elem,
+                                                           num_points, false, true, impl));
+      }
+
+      // Q function
+      CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));
+
+      // Output basis apply if needed
+      for (CeedInt j = 0; j < num_output_fields; j++) {
+        bool                is_active = false;
+        CeedInt             elem_size = 0;
+        CeedInt             field_out = impl->output_field_order[j];
+        CeedRestrictionType rstr_type;
+        CeedEvalMode        eval_mode;
+        CeedVector          l_vec, e_vec = impl->e_vecs_out[field_out], q_vec = impl->q_vecs_out[field_out];
+        CeedElemRestriction elem_rstr;
+
+        // ---- Skip non-active output
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[field_out], &l_vec));
+        is_active = l_vec == CEED_VECTOR_ACTIVE;
+        CeedCallBackend(CeedVectorDestroy(&l_vec));
+        if (!is_active) continue;
+        if (!e_vec) e_vec = active_e_vec_out;
+
+        // ---- Check if elem size matches
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[field_out], &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+        if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) continue;
+        if (rstr_type == CEED_RESTRICTION_POINTS) {
+          CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &elem_size));
+        } else {
+          CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+        }
+        {
+          CeedInt num_comp = 0;
+
+          CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+          if (e_vec_size != num_comp * elem_size) continue;
+        }
+
+        // Basis action
+        CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[field_out], &eval_mode));
+        switch (eval_mode) {
+          case CEED_EVAL_NONE: {
+            CeedScalar *e_vec_array;
+
+            CeedCallBackend(CeedVectorTakeArray(q_vec, CEED_MEM_DEVICE, &e_vec_array));
+            CeedCallBackend(CeedVectorRestoreArray(e_vec, &e_vec_array));
+            break;
+          }
+          case CEED_EVAL_INTERP:
+          case CEED_EVAL_GRAD:
+          case CEED_EVAL_DIV:
+          case CEED_EVAL_CURL: {
+            CeedBasis basis;
+
+            CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[field_out], &basis));
+            if (impl->apply_add_basis_out[field_out]) {
+              CeedCallBackend(CeedBasisApplyAddAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec,
+                                                        e_vec));
+            } else {
+              CeedCallBackend(CeedBasisApplyAtPoints(basis, num_elem, num_points, CEED_TRANSPOSE, eval_mode, impl->point_coords_elem, q_vec, e_vec));
+            }
+            CeedCallBackend(CeedBasisDestroy(&basis));
+            break;
+          }
+          // LCOV_EXCL_START
+          case CEED_EVAL_WEIGHT: {
+            return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+            // LCOV_EXCL_STOP
+          }
+        }
+
+        // Continue if a field that is summed into
+        if (impl->skip_rstr_out[field_out]) {
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+          continue;
+        }
+
+        // Mask output e-vec
+        CeedCallBackend(CeedVectorPointwiseMult(e_vec, active_e_vec_in, e_vec));
+
+        // Restrict
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, e_vec, assembled, request));
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+
+        // Reset q_vec for
+        if (eval_mode == CEED_EVAL_NONE) {
+          CeedScalar *e_vec_array;
+
+          CeedCallBackend(CeedVectorGetArrayWrite(e_vec, CEED_MEM_DEVICE, &e_vec_array));
+          CeedCallBackend(CeedVectorSetArray(q_vec, CEED_MEM_DEVICE, CEED_USE_POINTER, e_vec_array));
+        }
+      }
+
+      // Reset vec
+      if (s == e_vec_size - 1 && i != num_input_fields - 1) CeedCallBackend(CeedVectorSetValue(q_vec, 0.0));
+    }
+  }
+
+  // Restore CEED_EVAL_NONE
+  for (CeedInt i = 0; i < num_output_fields; i++) {
+    CeedEvalMode eval_mode;
+
+    // Get eval_mode
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+
+    // Restore evec
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_NONE) {
+      CeedScalar *e_vec_array;
+
+      CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, &e_vec_array));
+      CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &e_vec_array));
+    }
+  }
+
+  // Restore input arrays
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedOperatorInputRestore_Hip(op_input_fields[i], qf_input_fields[i], i, NULL, NULL, true, impl));
+  }
+
+  // Restore work vector
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_in));
+  CeedCallBackend(CeedRestoreWorkVector(ceed, &active_e_vec_out));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1249,14 +2082,35 @@ int CeedOperatorCreate_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedOperatorSetData(op, impl));
+
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Hip));
-  CeedCallBackend(
-      CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal",
+                                         CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Create operator AtPoints
+//------------------------------------------------------------------------------
+int CeedOperatorCreateAtPoints_Hip(CeedOperator op) {
+  Ceed              ceed;
+  CeedOperator_Hip *impl;
+
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedCalloc(1, &impl));
+  CeedCallBackend(CeedOperatorSetData(op, impl));
+
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
index 222a94fc85..bf938eacc4 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
+++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -25,40 +25,38 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
   using std::string;
 
   Ceed                ceed;
-  char               *read_write_kernel_source;
-  const char         *read_write_kernel_path;
   Ceed_Hip           *ceed_Hip;
   CeedInt             num_input_fields, num_output_fields, size;
   CeedQFunctionField *input_fields, *output_fields;
   CeedQFunction_Hip  *data;
 
-  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
-  CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
-  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data));
-
   // QFunction is built
+  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data));
   if (data->QFunction) return CEED_ERROR_SUCCESS;
 
-  CeedCheck(data->qfunction_source, ceed, CEED_ERROR_BACKEND, "No QFunction source or hipFunction_t provided.");
+  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
 
   // QFunction kernel generation
   CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
 
   // Build strings for final kernel
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-qfunction.h", &read_write_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n");
-  string        qfunction_source(data->qfunction_source);
   string        qfunction_name(data->qfunction_name);
-  string        read_write(read_write_kernel_source);
   string        kernel_name = "CeedKernelHipRefQFunction_" + qfunction_name;
   ostringstream code;
 
-  // Defintions
-  code << read_write;
-  code << qfunction_source;
-  code << "\n";
+  // Definitions
+  code << "// QFunction source\n";
+  code << "#include <ceed/jit-source/hip/hip-ref-qfunction.h>\n\n";
+  {
+    const char *source_path;
+
+    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
+    CeedCheck(source_path, ceed, CEED_ERROR_BACKEND, "No QFunction source or hipFunction_t provided.");
+
+    code << "// User QFunction source\n";
+    code << "#include \"" << source_path << "\"\n\n";
+  }
   code << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
   code << "__global__ void " << kernel_name << "(void *ctx, CeedInt Q, Fields_Hip fields) {\n";
 
@@ -69,7 +67,7 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
     code << "  const CeedInt size_input_" << i << " = " << size << ";\n";
     code << "  CeedScalar input_" << i << "[size_input_" << i << "];\n";
   }
-  code << "  const CeedScalar* inputs[" << num_input_fields << "];\n";
+  code << "  const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
     code << "  inputs[" << i << "] = input_" << i << ";\n";
   }
@@ -82,7 +80,7 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
     code << "  const CeedInt size_output_" << i << " = " << size << ";\n";
     code << "  CeedScalar output_" << i << "[size_output_" << i << "];\n";
   }
-  code << "  CeedScalar* outputs[" << num_output_fields << "];\n";
+  code << "  CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     code << "  outputs[" << i << "] = output_" << i << ";\n";
   }
@@ -111,18 +109,10 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) {
   code << "  }\n";
   code << "}\n";
 
-  // View kernel for debugging
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated QFunction Kernels:\n");
-  CeedDebug(ceed, code.str().c_str());
-
   // Compile kernel
   CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 1, "BLOCK_SIZE", ceed_Hip->opt_block_size));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, kernel_name.c_str(), &data->QFunction));
-
-  // Cleanup
-  CeedCallBackend(CeedFree(&data->qfunction_source));
-  CeedCallBackend(CeedFree(&read_write_kernel_path));
-  CeedCallBackend(CeedFree(&read_write_kernel_source));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.h b/backends/hip-ref/ceed-hip-ref-qfunction-load.h
index dc83256d83..5fc7073046 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction-load.h
+++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/hip-ref/ceed-hip-ref-qfunction.c b/backends/hip-ref/ceed-hip-ref-qfunction.c
index e5e72cfd43..60dd757ee7 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunction.c
+++ b/backends/hip-ref/ceed-hip-ref-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -60,6 +60,7 @@ static int CeedQFunctionApply_Hip(CeedQFunction qf, CeedInt Q, CeedVector *U, Ce
 
   // Restore context
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &data->d_c));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -88,15 +89,12 @@ int CeedQFunctionCreate_Hip(CeedQFunction qf) {
   CeedCallBackend(CeedQFunctionSetData(qf, data));
   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
 
-  // Read QFunction source
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
-  CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
index 52bf13370b..a223fa91d8 100644
--- a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
+++ b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -37,6 +37,7 @@ static inline int CeedQFunctionContextSyncH2D_Hip(const CeedQFunctionContext ctx
     impl->d_data = impl->d_data_owned;
   }
   CeedCallHip(ceed, hipMemcpy(impl->d_data, impl->h_data, ctx_size, hipMemcpyHostToDevice));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -63,6 +64,7 @@ static inline int CeedQFunctionContextSyncD2H_Hip(const CeedQFunctionContext ctx
     impl->h_data = impl->h_data_owned;
   }
   CeedCallHip(ceed, hipMemcpy(impl->h_data, impl->d_data, ctx_size, hipMemcpyDeviceToHost));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -76,7 +78,9 @@ static inline int CeedQFunctionContextSync_Hip(const CeedQFunctionContext ctx, C
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSyncH2D_Hip(ctx);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -204,6 +208,7 @@ static int CeedQFunctionContextSetDataDevice_Hip(const CeedQFunctionContext ctx,
       impl->d_data          = data;
       break;
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -219,7 +224,9 @@ static int CeedQFunctionContextSetData_Hip(const CeedQFunctionContext ctx, const
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSetDataDevice_Hip(ctx, copy_mode, data);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -334,6 +341,7 @@ int CeedQFunctionContextCreate_Hip(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c
index 625430f9d7..b1cd8b5c06 100644
--- a/backends/hip-ref/ceed-hip-ref-restriction.c
+++ b/backends/hip-ref/ceed-hip-ref-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -23,36 +23,34 @@
 static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr) {
   Ceed                     ceed;
   bool                     is_deterministic;
-  char                    *restriction_kernel_source;
-  const char              *restriction_kernel_path;
   CeedInt                  num_elem, num_comp, elem_size, comp_stride;
   CeedRestrictionType      rstr_type;
   CeedElemRestriction_Hip *impl;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
-  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
-  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr, &elem_size));
+  } else {
+    CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
+  }
   is_deterministic = impl->d_l_vec_indices != NULL;
 
   // Compile HIP kernels
   switch (rstr_type) {
     case CEED_RESTRICTION_STRIDED: {
-      bool    has_backend_strides;
-      CeedInt strides[3] = {1, num_elem * elem_size, elem_size};
+      const char restriction_kernel_source[] = "// Strided restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-strided.h>\n";
+      bool       has_backend_strides;
+      CeedInt    strides[3] = {1, num_elem * elem_size, elem_size};
 
       CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
       if (!has_backend_strides) {
         CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
       }
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-strided.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
       CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                       "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM",
                                       strides[2]));
@@ -60,27 +58,30 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose));
     } break;
     case CEED_RESTRICTION_STANDARD: {
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      const char restriction_kernel_source[] = "// Standard restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-offset.h>\n";
+
       CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                       "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                       "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose));
     } break;
+    case CEED_RESTRICTION_POINTS: {
+      const char restriction_kernel_source[] =
+          "// AtPoints restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-at-points.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-offset.h>\n";
+
+      CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
+                                      "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
+                                      "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
+      CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose));
+      CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "AtPointsTranspose", &impl->ApplyTranspose));
+    } break;
     case CEED_RESTRICTION_ORIENTED: {
-      const char *offset_kernel_path;
-      char      **file_paths     = NULL;
-      CeedInt     num_file_paths = 0;
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-oriented.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path));
-      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      const char restriction_kernel_source[] =
+          "// Oriented restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-oriented.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-offset.h>\n";
+
       CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                       "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                       "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
@@ -88,22 +89,12 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose));
-      // Cleanup
-      CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-      CeedCall(CeedFree(&file_paths));
     } break;
     case CEED_RESTRICTION_CURL_ORIENTED: {
-      const char *offset_kernel_path;
-      char      **file_paths     = NULL;
-      CeedInt     num_file_paths = 0;
-
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h", &restriction_kernel_path));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n");
-      CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path));
-      CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source));
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n");
+      const char restriction_kernel_source[] =
+          "// Curl oriented restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h>\n\n"
+          "// Standard restriction source\n#include <ceed/jit-source/hip/hip-ref-restriction-offset.h>\n";
+
       CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem,
                                       "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride,
                                       "USE_DETERMINISTIC", is_deterministic ? 1 : 0));
@@ -113,19 +104,10 @@ static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr)
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose));
       CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose));
-      // Cleanup
-      CeedCallBackend(CeedFree(&offset_kernel_path));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-      CeedCall(CeedFree(&file_paths));
-    } break;
-    case CEED_RESTRICTION_POINTS: {
-      // LCOV_EXCL_START
-      return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-      // LCOV_EXCL_STOP
+
     } break;
   }
-  CeedCallBackend(CeedFree(&restriction_kernel_path));
-  CeedCallBackend(CeedFree(&restriction_kernel_source));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -174,6 +156,7 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce
 
         CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyNoTranspose, grid, block_size, args));
       } break;
+      case CEED_RESTRICTION_POINTS:
       case CEED_RESTRICTION_STANDARD: {
         void *args[] = {&impl->d_offsets, &d_u, &d_v};
 
@@ -205,11 +188,6 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce
           CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args));
         }
       } break;
-      case CEED_RESTRICTION_POINTS: {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-        // LCOV_EXCL_STOP
-      } break;
     }
   } else {
     // E-vector -> L-vector
@@ -223,6 +201,17 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce
 
         CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args));
       } break;
+      case CEED_RESTRICTION_POINTS: {
+        if (!is_deterministic) {
+          void *args[] = {&impl->d_offsets, &impl->d_points_per_elem, &d_u, &d_v};
+
+          CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args));
+        } else {
+          void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_points_per_elem, &impl->d_t_offsets, &d_u, &d_v};
+
+          CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args));
+        }
+      } break;
       case CEED_RESTRICTION_STANDARD: {
         if (!is_deterministic) {
           void *args[] = {&impl->d_offsets, &d_u, &d_v};
@@ -290,11 +279,6 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce
           }
         }
       } break;
-      case CEED_RESTRICTION_POINTS: {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints");
-        // LCOV_EXCL_STOP
-      } break;
     }
   }
 
@@ -303,6 +287,7 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce
   // Restore arrays
   CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -334,14 +319,16 @@ static int CeedElemRestrictionApplyUnoriented_Hip(CeedElemRestriction rstr, Ceed
 //------------------------------------------------------------------------------
 static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) {
   CeedElemRestriction_Hip *impl;
+  CeedRestrictionType      rstr_type;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   switch (mem_type) {
     case CEED_MEM_HOST:
-      *offsets = impl->h_offsets;
+      *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->h_offsets_at_points : impl->h_offsets;
       break;
     case CEED_MEM_DEVICE:
-      *offsets = impl->d_offsets;
+      *offsets = rstr_type == CEED_RESTRICTION_POINTS ? impl->d_offsets_at_points : impl->d_offsets;
       break;
   }
   return CEED_ERROR_SUCCESS;
@@ -383,6 +370,17 @@ static int CeedElemRestrictionGetCurlOrientations_Hip(CeedElemRestriction rstr,
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Get offset for padded AtPoints E-layout
+//------------------------------------------------------------------------------
+static int CeedElemRestrictionGetAtPointsElementOffset_Hip(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset) {
+  CeedInt layout[3];
+
+  CeedCallBackend(CeedElemRestrictionGetELayout(rstr, layout));
+  *elem_offset = 0 * layout[0] + 0 * layout[1] + elem * layout[2];
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Destroy restriction
 //------------------------------------------------------------------------------
@@ -404,25 +402,31 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction rstr) {
   CeedCallHip(ceed, hipFree((bool *)impl->d_orients_owned));
   CeedCallBackend(CeedFree(&impl->h_curl_orients_owned));
   CeedCallHip(ceed, hipFree((CeedInt8 *)impl->d_curl_orients_owned));
+  CeedCallBackend(CeedFree(&impl->h_offsets_at_points_owned));
+  CeedCallHip(ceed, hipFree((CeedInt8 *)impl->d_offsets_at_points_owned));
+  CeedCallBackend(CeedFree(&impl->h_points_per_elem_owned));
+  CeedCallHip(ceed, hipFree((CeedInt *)impl->d_points_per_elem_owned));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Create transpose offsets and indices
 //------------------------------------------------------------------------------
-static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction rstr, const CeedInt *indices) {
+static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction rstr, const CeedInt elem_size, const CeedInt *indices) {
   Ceed                     ceed;
   bool                    *is_node;
   CeedSize                 l_size;
-  CeedInt                  num_elem, elem_size, num_comp, num_nodes = 0;
+  CeedInt                  num_elem, num_comp, num_nodes = 0;
   CeedInt                 *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices;
+  CeedRestrictionType      rstr_type;
   CeedElemRestriction_Hip *impl;
 
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
+  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
   CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
   const CeedInt size_indices = num_elem * elem_size;
@@ -485,6 +489,7 @@ static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction rstr, const C
   CeedCallBackend(CeedFree(&l_vec_indices));
   CeedCallBackend(CeedFree(&t_offsets));
   CeedCallBackend(CeedFree(&t_indices));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -495,16 +500,27 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
                                   const CeedInt8 *curl_orients, CeedElemRestriction rstr) {
   Ceed                     ceed, ceed_parent;
   bool                     is_deterministic;
-  CeedInt                  num_elem, elem_size;
+  CeedInt                  num_elem, num_comp, elem_size;
   CeedRestrictionType      rstr_type;
   CeedElemRestriction_Hip *impl;
 
   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
   CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
+  CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
   CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
+  // Use max number of points as elem size for AtPoints restrictions
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedInt max_points = 0;
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      max_points = CeedIntMax(max_points, offsets[i + 1] - offsets[i]);
+    }
+    elem_size = max_points;
+  }
   const CeedInt size = num_elem * elem_size;
 
   CeedCallBackend(CeedCalloc(1, &impl));
@@ -525,6 +541,51 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
     }
   }
 
+  // Pad AtPoints indices
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedSize offsets_len = elem_size * num_elem, at_points_size = num_elem + 1;
+    CeedInt  max_points = elem_size, *offsets_padded, *points_per_elem;
+
+    CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "only MemType Host supported when creating AtPoints restriction");
+    CeedCallBackend(CeedMalloc(offsets_len, &offsets_padded));
+    CeedCallBackend(CeedMalloc(num_elem, &points_per_elem));
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points = offsets[i + 1] - offsets[i];
+      CeedInt last_point = 0;
+
+      points_per_elem[i] = num_points;
+      at_points_size += num_points;
+      // -- Copy all points in element
+      for (CeedInt j = 0; j < num_points; j++) {
+        offsets_padded[i * max_points + j] = offsets[offsets[i] + j] * num_comp;
+        last_point                         = offsets_padded[i * max_points + j];
+      }
+      // -- Replicate out last point in element
+      for (CeedInt j = num_points; j < max_points; j++) {
+        offsets_padded[i * max_points + j] = last_point;
+      }
+    }
+    CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, at_points_size, &impl->h_offsets_at_points_owned, &impl->h_offsets_at_points_borrowed,
+                                            &impl->h_offsets_at_points));
+    CeedCallHip(ceed, hipMalloc((void **)&impl->d_offsets_at_points_owned, at_points_size * sizeof(CeedInt)));
+    CeedCallHip(ceed, hipMemcpy((CeedInt **)impl->d_offsets_at_points_owned, impl->h_offsets_at_points, at_points_size * sizeof(CeedInt),
+                                hipMemcpyHostToDevice));
+    impl->d_offsets_at_points = (CeedInt *)impl->d_offsets_at_points_owned;
+
+    // -- Use padded offsets for the rest of the setup
+    offsets   = (const CeedInt *)offsets_padded;
+    copy_mode = CEED_OWN_POINTER;
+    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, elem_size * num_elem * num_comp));
+
+    // -- Points per element
+    CeedCallBackend(CeedSetHostCeedIntArray(points_per_elem, CEED_OWN_POINTER, num_elem, &impl->h_points_per_elem_owned,
+                                            &impl->h_points_per_elem_borrowed, &impl->h_points_per_elem));
+    CeedCallHip(ceed, hipMalloc((void **)&impl->d_points_per_elem_owned, num_elem * sizeof(CeedInt)));
+    CeedCallHip(ceed,
+                hipMemcpy((CeedInt **)impl->d_points_per_elem_owned, impl->h_points_per_elem, num_elem * sizeof(CeedInt), hipMemcpyHostToDevice));
+    impl->d_points_per_elem = (CeedInt *)impl->d_points_per_elem_owned;
+  }
+
   // Set up device offset/orientation arrays
   if (rstr_type != CEED_RESTRICTION_STRIDED) {
     switch (mem_type) {
@@ -533,7 +594,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
         CeedCallHip(ceed, hipMalloc((void **)&impl->d_offsets_owned, size * sizeof(CeedInt)));
         CeedCallHip(ceed, hipMemcpy((CeedInt **)impl->d_offsets_owned, impl->h_offsets, size * sizeof(CeedInt), hipMemcpyHostToDevice));
         impl->d_offsets = (CeedInt *)impl->d_offsets_owned;
-        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, offsets));
+        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, elem_size, offsets));
       } break;
       case CEED_MEM_DEVICE: {
         CeedCallBackend(CeedSetDeviceCeedIntArray_Hip(ceed, offsets, copy_mode, size, &impl->d_offsets_owned, &impl->d_offsets_borrowed,
@@ -541,7 +602,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
         CeedCallBackend(CeedMalloc(size, &impl->h_offsets_owned));
         CeedCallHip(ceed, hipMemcpy((CeedInt **)impl->h_offsets_owned, impl->d_offsets, size * sizeof(CeedInt), hipMemcpyDeviceToHost));
         impl->h_offsets = impl->h_offsets_owned;
-        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, offsets));
+        if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, elem_size, offsets));
       } break;
     }
 
@@ -591,7 +652,12 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Hip));
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetAtPointsElementOffset",
+                                           CeedElemRestrictionGetAtPointsElementOffset_Hip));
+  }
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Hip));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c
index 5789679578..f1d1dcd93a 100644
--- a/backends/hip-ref/ceed-hip-ref-vector.c
+++ b/backends/hip-ref/ceed-hip-ref-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -39,15 +39,13 @@ static inline int CeedVectorNeedSync_Hip(const CeedVector vec, CeedMemType mem_t
 // Sync host to device
 //------------------------------------------------------------------------------
 static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) {
-  Ceed            ceed;
   CeedSize        length;
   size_t          bytes;
   CeedVector_Hip *impl;
 
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
 
-  CeedCheck(impl->h_array, ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device");
+  CeedCheck(impl->h_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid host data to sync to device");
 
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   bytes = length * sizeof(CeedScalar);
@@ -56,10 +54,10 @@ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) {
   } else if (impl->d_array_owned) {
     impl->d_array = impl->d_array_owned;
   } else {
-    CeedCallHip(ceed, hipMalloc((void **)&impl->d_array_owned, bytes));
+    CeedCallHip(CeedVectorReturnCeed(vec), hipMalloc((void **)&impl->d_array_owned, bytes));
     impl->d_array = impl->d_array_owned;
   }
-  CeedCallHip(ceed, hipMemcpy(impl->d_array, impl->h_array, bytes, hipMemcpyHostToDevice));
+  CeedCallHip(CeedVectorReturnCeed(vec), hipMemcpy(impl->d_array, impl->h_array, bytes, hipMemcpyHostToDevice));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -67,15 +65,13 @@ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) {
 // Sync device to host
 //------------------------------------------------------------------------------
 static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) {
-  Ceed            ceed;
   CeedSize        length;
   size_t          bytes;
   CeedVector_Hip *impl;
 
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
 
-  CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host");
+  CeedCheck(impl->d_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid device data to sync to host");
 
   if (impl->h_array_borrowed) {
     impl->h_array = impl->h_array_borrowed;
@@ -91,7 +87,7 @@ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) {
 
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   bytes = length * sizeof(CeedScalar);
-  CeedCallHip(ceed, hipMemcpy(impl->h_array, impl->d_array, bytes, hipMemcpyDeviceToHost));
+  CeedCallHip(CeedVectorReturnCeed(vec), hipMemcpy(impl->h_array, impl->d_array, bytes, hipMemcpyDeviceToHost));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -99,7 +95,15 @@ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) {
 // Sync arrays
 //------------------------------------------------------------------------------
 static int CeedVectorSyncArray_Hip(const CeedVector vec, CeedMemType mem_type) {
-  bool need_sync = false;
+  bool            need_sync = false;
+  CeedVector_Hip *impl;
+
+  // Sync for unified memory
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  if (impl->has_unified_addressing && !impl->h_array_borrowed) {
+    CeedCallHip(CeedVectorReturnCeed(vec), hipDeviceSynchronize());
+    return CEED_ERROR_SUCCESS;
+  }
 
   // Check whether device/host sync is needed
   CeedCallBackend(CeedVectorNeedSync_Hip(vec, mem_type, &need_sync));
@@ -111,7 +115,9 @@ static int CeedVectorSyncArray_Hip(const CeedVector vec, CeedMemType mem_type) {
     case CEED_MEM_DEVICE:
       return CeedVectorSyncH2D_Hip(vec);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -162,6 +168,10 @@ static inline int CeedVectorHasBorrowedArrayOfType_Hip(const CeedVector vec, Cee
   CeedVector_Hip *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
+
+  // Use device memory for unified memory
+  mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? CEED_MEM_DEVICE : mem_type;
+
   switch (mem_type) {
     case CEED_MEM_HOST:
       *has_borrowed_array_of_type = impl->h_array_borrowed;
@@ -202,6 +212,44 @@ static int CeedVectorSetArrayDevice_Hip(const CeedVector vec, const CeedCopyMode
 
   CeedCallBackend(CeedSetDeviceCeedScalarArray_Hip(ceed, array, copy_mode, length, (const CeedScalar **)&impl->d_array_owned,
                                                    (const CeedScalar **)&impl->d_array_borrowed, (const CeedScalar **)&impl->d_array));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Set array with unified memory
+//------------------------------------------------------------------------------
+static int CeedVectorSetArrayUnifiedHostToDevice_Hip(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) {
+  CeedSize        length;
+  Ceed            ceed;
+  CeedVector_Hip *impl;
+
+  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+
+  switch (copy_mode) {
+    case CEED_COPY_VALUES:
+    case CEED_OWN_POINTER:
+      if (!impl->d_array) {
+        if (impl->d_array_borrowed) {
+          impl->d_array = impl->d_array_borrowed;
+        } else {
+          if (!impl->d_array_owned) CeedCallHip(ceed, hipMalloc((void **)&impl->d_array_owned, sizeof(CeedScalar) * length));
+          impl->d_array = impl->d_array_owned;
+        }
+      }
+      if (array) CeedCallHip(ceed, hipMemcpy(impl->d_array, array, sizeof(CeedScalar) * length, hipMemcpyHostToDevice));
+      if (copy_mode == CEED_OWN_POINTER) CeedCallBackend(CeedFree(&array));
+      break;
+    case CEED_USE_POINTER:
+      CeedCallHip(ceed, hipFree(impl->d_array_owned));
+      CeedCallBackend(CeedFree(&impl->h_array_owned));
+      impl->h_array_owned    = NULL;
+      impl->h_array_borrowed = array;
+      impl->d_array          = impl->h_array_borrowed;
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -216,11 +264,83 @@ static int CeedVectorSetArray_Hip(const CeedVector vec, const CeedMemType mem_ty
   CeedCallBackend(CeedVectorSetAllInvalid_Hip(vec));
   switch (mem_type) {
     case CEED_MEM_HOST:
-      return CeedVectorSetArrayHost_Hip(vec, copy_mode, array);
+      if (impl->has_unified_addressing) {
+        return CeedVectorSetArrayUnifiedHostToDevice_Hip(vec, copy_mode, array);
+      } else {
+        return CeedVectorSetArrayHost_Hip(vec, copy_mode, array);
+      }
     case CEED_MEM_DEVICE:
       return CeedVectorSetArrayDevice_Hip(vec, copy_mode, array);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
+}
+
+//------------------------------------------------------------------------------
+// Copy host array to value strided
+//------------------------------------------------------------------------------
+static int CeedHostCopyStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *h_copy_array) {
+  for (CeedSize i = start; i < stop; i += step) h_copy_array[i] = h_array[i];
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Copy device array to value strided (impl in .hip.cpp file)
+//------------------------------------------------------------------------------
+int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array);
+
+//------------------------------------------------------------------------------
+// Copy a vector to a value strided
+//------------------------------------------------------------------------------
+static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy) {
+  CeedSize        length;
+  CeedVector_Hip *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  {
+    CeedSize length_vec, length_copy;
+
+    CeedCallBackend(CeedVectorGetLength(vec, &length_vec));
+    CeedCallBackend(CeedVectorGetLength(vec_copy, &length_copy));
+    length = length_vec < length_copy ? length_vec : length_copy;
+  }
+  if (stop == -1) stop = length;
+  // Set value for synced device/host array
+  if (impl->d_array) {
+    CeedScalar *copy_array;
+    Ceed        ceed;
+
+    CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+    CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array));
+#if (HIP_VERSION >= 60000000)
+    hipblasHandle_t handle;
+    hipStream_t     stream;
+    CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle));
+    CeedCallHipblas(ceed, hipblasGetStream(handle, &stream));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallHipblas(ceed, hipblasScopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+#else  /* CEED_SCALAR */
+    CeedCallHipblas(ceed, hipblasDcopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step));
+#endif /* CEED_SCALAR */
+    CeedCallHip(ceed, hipStreamSynchronize(stream));
+#else  /* HIP_VERSION */
+    CeedCallBackend(CeedDeviceCopyStrided_Hip(impl->d_array, start, stop, step, copy_array));
+#endif /* HIP_VERSION */
+    CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+    impl->h_array = NULL;
+    CeedCallBackend(CeedDestroy(&ceed));
+  } else if (impl->h_array) {
+    CeedScalar *copy_array;
+
+    CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array));
+    CeedCallBackend(CeedHostCopyStrided_Hip(impl->h_array, start, stop, step, copy_array));
+    CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
+    impl->d_array = NULL;
+  } else {
+    return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
+  }
+  return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
@@ -242,8 +362,10 @@ int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, CeedScalar val)
 static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) {
   CeedSize        length;
   CeedVector_Hip *impl;
+  Ceed_Hip       *hip_data;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedGetData(CeedVectorReturnCeed(vec), &hip_data));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   // Set value for synced device/host array
   if (!impl->d_array && !impl->h_array) {
@@ -260,16 +382,55 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) {
     }
   }
   if (impl->d_array) {
-    CeedCallBackend(CeedDeviceSetValue_Hip(impl->d_array, length, val));
+    if (val == 0 && !impl->h_array_borrowed) {
+      CeedCallHip(CeedVectorReturnCeed(vec), hipMemset(impl->d_array, 0, length * sizeof(CeedScalar)));
+    } else {
+      CeedCallBackend(CeedDeviceSetValue_Hip(impl->d_array, length, val));
+    }
     impl->h_array = NULL;
-  }
-  if (impl->h_array) {
+  } else if (impl->h_array) {
     CeedCallBackend(CeedHostSetValue_Hip(impl->h_array, length, val));
     impl->d_array = NULL;
   }
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Set host array to value strided
+//------------------------------------------------------------------------------
+static int CeedHostSetValueStrided_Hip(CeedScalar *h_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
+  for (CeedSize i = start; i < stop; i += step) h_array[i] = val;
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Set device array to value strided (impl in .hip.cpp file)
+//------------------------------------------------------------------------------
+int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val);
+
+//------------------------------------------------------------------------------
+// Set a vector to a value strided
+//------------------------------------------------------------------------------
+static int CeedVectorSetValueStrided_Hip(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
+  CeedSize        length;
+  CeedVector_Hip *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+  // Set value for synced device/host array
+  if (stop == -1) stop = length;
+  if (impl->d_array) {
+    CeedCallBackend(CeedDeviceSetValueStrided_Hip(impl->d_array, start, stop, step, val));
+    impl->h_array = NULL;
+  } else if (impl->h_array) {
+    CeedCallBackend(CeedHostSetValueStrided_Hip(impl->h_array, start, stop, step, val));
+    impl->d_array = NULL;
+  } else {
+    return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Vector Take Array
 //------------------------------------------------------------------------------
@@ -298,14 +459,17 @@ static int CeedVectorTakeArray_Hip(CeedVector vec, CeedMemType mem_type, CeedSca
 }
 
 //------------------------------------------------------------------------------
-// Core logic for array syncronization for GetArray.
+// Core logic for array synchronization for GetArray.
 //   If a different memory type is most up to date, this will perform a copy
 //------------------------------------------------------------------------------
-static int CeedVectorGetArrayCore_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
+static int CeedVectorGetArrayCore_Hip(const CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   CeedVector_Hip *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
 
+  // Use device memory for unified memory
+  mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? CEED_MEM_DEVICE : mem_type;
+
   // Sync array to requested mem_type
   CeedCallBackend(CeedVectorSyncArray(vec, mem_type));
 
@@ -331,15 +495,21 @@ static int CeedVectorGetArrayRead_Hip(const CeedVector vec, const CeedMemType me
 //------------------------------------------------------------------------------
 // Get read/write access to a vector via the specified mem_type
 //------------------------------------------------------------------------------
-static int CeedVectorGetArray_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
+static int CeedVectorGetArray_Hip(const CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   CeedVector_Hip *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
+
+  // Use device memory for unified memory
+  mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? CEED_MEM_DEVICE : mem_type;
+
+  // 'Get' array and set only 'get'ed array as valid
   CeedCallBackend(CeedVectorGetArrayCore_Hip(vec, mem_type, array));
   CeedCallBackend(CeedVectorSetAllInvalid_Hip(vec));
   switch (mem_type) {
     case CEED_MEM_HOST:
       impl->h_array = *array;
+      if (impl->has_unified_addressing) impl->d_array = *array;
       break;
     case CEED_MEM_DEVICE:
       impl->d_array = *array;
@@ -351,11 +521,17 @@ static int CeedVectorGetArray_Hip(const CeedVector vec, const CeedMemType mem_ty
 //------------------------------------------------------------------------------
 // Get write access to a vector via the specified mem_type
 //------------------------------------------------------------------------------
-static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
+static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   bool            has_array_of_type = true;
   CeedVector_Hip *impl;
+  Ceed_Hip       *hip_data;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedGetData(CeedVectorReturnCeed(vec), &hip_data));
+
+  // Use device memory for unified memory
+  mem_type = impl->has_unified_addressing && !impl->h_array_borrowed ? CEED_MEM_DEVICE : mem_type;
+
   CeedCallBackend(CeedVectorHasArrayOfType_Hip(vec, mem_type, &has_array_of_type));
   if (!has_array_of_type) {
     // Allocate if array is not yet allocated
@@ -379,119 +555,191 @@ static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, const CeedMemType m
 // Get the norm of a CeedVector
 //------------------------------------------------------------------------------
 static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *norm) {
-  Ceed              ceed;
-  CeedSize          length, num_calls;
+  Ceed     ceed;
+  CeedSize length;
+#if (HIP_VERSION < 60000000)
+  CeedSize num_calls;
+#endif /* HIP_VERSION */
   const CeedScalar *d_array;
   CeedVector_Hip   *impl;
   hipblasHandle_t   handle;
+  hipStream_t       stream;
+  Ceed_Hip         *hip_data;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &hip_data));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
   CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle));
-
-  // Is the vector too long to handle with int32? If so, we will divide
-  // it up into "int32-sized" subsections and make repeated BLAS calls.
+  CeedCallHipblas(ceed, hipblasGetStream(handle, &stream));
+#if (HIP_VERSION < 60000000)
+  // With ROCm 6, we can use the 64-bit integer interface. Prior to that,
+  // we need to check if the vector is too long to handle with int32,
+  // and if so, divide it into subsections for repeated hipBLAS calls.
   num_calls = length / INT_MAX;
   if (length % INT_MAX > 0) num_calls += 1;
+#endif /* HIP_VERSION */
 
   // Compute norm
   CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
   switch (type) {
     case CEED_NORM_1: {
       *norm = 0.0;
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-        float  sub_norm = 0.0;
-        float *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
-          *norm += sub_norm;
-        }
-      } else {
-        double  sub_norm = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
-          *norm += sub_norm;
-        }
+#if defined(CEED_SCALAR_IS_FP32)
+#if (HIP_VERSION >= 60000000)  // We have ROCm 6, and can use 64-bit integers
+      CeedCallHipblas(ceed, hipblasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
+      CeedCallHip(ceed, hipStreamSynchronize(stream));
+#else  /* HIP_VERSION */
+      float  sub_norm = 0.0;
+      float *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallHipblas(ceed, hipblasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
+        *norm += sub_norm;
+      }
+#endif /* HIP_VERSION */
+#else  /* CEED_SCALAR */
+#if (HIP_VERSION >= 60000000)
+      CeedCallHipblas(ceed, hipblasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
+      CeedCallHip(ceed, hipStreamSynchronize(stream));
+#else  /* HIP_VERSION */
+      double  sub_norm = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallHipblas(ceed, hipblasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
+        *norm += sub_norm;
       }
+#endif /* HIP_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
     case CEED_NORM_2: {
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-        float  sub_norm = 0.0, norm_sum = 0.0;
-        float *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
-          norm_sum += sub_norm * sub_norm;
-        }
-        *norm = sqrt(norm_sum);
-      } else {
-        double  sub_norm = 0.0, norm_sum = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
-          norm_sum += sub_norm * sub_norm;
-        }
-        *norm = sqrt(norm_sum);
+#if defined(CEED_SCALAR_IS_FP32)
+#if (HIP_VERSION >= 60000000)
+      CeedCallHipblas(ceed, hipblasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
+      CeedCallHip(ceed, hipStreamSynchronize(stream));
+#else  /* HIP_VERSION */
+      float  sub_norm = 0.0, norm_sum = 0.0;
+      float *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallHipblas(ceed, hipblasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
+        norm_sum += sub_norm * sub_norm;
+      }
+      *norm = sqrt(norm_sum);
+#endif /* HIP_VERSION */
+#else  /* CEED_SCALAR */
+#if (HIP_VERSION >= 60000000)
+      CeedCallHipblas(ceed, hipblasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
+      CeedCallHip(ceed, hipStreamSynchronize(stream));
+#else  /* HIP_VERSION */
+      double  sub_norm = 0.0, norm_sum = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallHipblas(ceed, hipblasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
+        norm_sum += sub_norm * sub_norm;
       }
+      *norm = sqrt(norm_sum);
+#endif /* HIP_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
     case CEED_NORM_MAX: {
+#if defined(CEED_SCALAR_IS_FP32)
+#if (HIP_VERSION >= 60000000)
+      int64_t    index;
+      CeedScalar norm_no_abs;
+
+      CeedCallHipblas(ceed, hipblasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
+      CeedCallHip(ceed, hipMemcpyAsync(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
+      CeedCallHip(ceed, hipStreamSynchronize(stream));
+      *norm = fabs(norm_no_abs);
+#else  /* HIP_VERSION */
       CeedInt index;
-
-      if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
-        float  sub_max = 0.0, current_max = 0.0;
-        float *d_array_start;
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
-          CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
-          if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
+      float   sub_max = 0.0, current_max = 0.0;
+      float  *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
+        if (hip_data->has_unified_addressing) {
+          CeedCallHip(ceed, hipStreamSynchronize(stream));
+          sub_max = fabs(d_array[index - 1]);
+        } else {
+          CeedCallHip(ceed, hipMemcpyAsync(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
+          CeedCallHip(ceed, hipStreamSynchronize(stream));
         }
-        *norm = current_max;
+        if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
+      }
+      *norm = current_max;
+#endif /* HIP_VERSION */
+#else  /* CEED_SCALAR */
+#if (HIP_VERSION >= 60000000)
+      int64_t    index;
+      CeedScalar norm_no_abs;
+
+      CeedCallHipblas(ceed, hipblasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
+      if (hip_data->has_unified_addressing) {
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
+        norm_no_abs = fabs(d_array[index - 1]);
       } else {
-        double  sub_max = 0.0, current_max = 0.0;
-        double *d_array_start;
-
-        for (CeedInt i = 0; i < num_calls; i++) {
-          d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
-          CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
-          CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
-
-          CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
-          CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost));
-          if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
+        CeedCallHip(ceed, hipMemcpyAsync(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
+        CeedCallHip(ceed, hipStreamSynchronize(stream));
+      }
+      *norm = fabs(norm_no_abs);
+#else  /* HIP_VERSION */
+      CeedInt index;
+      double  sub_max = 0.0, current_max = 0.0;
+      double *d_array_start;
+
+      for (CeedInt i = 0; i < num_calls; i++) {
+        d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
+        CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
+        CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
+
+        CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
+        if (hip_data->has_unified_addressing) {
+          CeedCallHip(ceed, hipStreamSynchronize(stream));
+          sub_max = fabs(d_array[index - 1]);
+        } else {
+          CeedCallHip(ceed, hipMemcpyAsync(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream));
+          CeedCallHip(ceed, hipStreamSynchronize(stream));
         }
-        *norm = current_max;
+        if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
       }
+      *norm = current_max;
+#endif /* HIP_VERSION */
+#endif /* CEED_SCALAR */
       break;
     }
   }
   CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -506,7 +754,7 @@ static int CeedHostReciprocal_Hip(CeedScalar *h_array, CeedSize length) {
 }
 
 //------------------------------------------------------------------------------
-// Take reciprocal of a vector on device (impl in .cu file)
+// Take reciprocal of a vector on device (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length);
 
@@ -534,7 +782,7 @@ static int CeedHostScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize len
 }
 
 //------------------------------------------------------------------------------
-// Compute x = alpha x on device (impl in .cu file)
+// Compute x = alpha x on device (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length);
 
@@ -543,13 +791,33 @@ int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length);
 //------------------------------------------------------------------------------
 static int CeedVectorScale_Hip(CeedVector x, CeedScalar alpha) {
   CeedSize        length;
-  CeedVector_Hip *x_impl;
+  CeedVector_Hip *impl;
 
-  CeedCallBackend(CeedVectorGetData(x, &x_impl));
+  CeedCallBackend(CeedVectorGetData(x, &impl));
   CeedCallBackend(CeedVectorGetLength(x, &length));
   // Set value for synced device/host array
-  if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Hip(x_impl->d_array, alpha, length));
-  if (x_impl->h_array) CeedCallBackend(CeedHostScale_Hip(x_impl->h_array, alpha, length));
+  if (impl->d_array) {
+#if (HIP_VERSION >= 60000000)
+    hipblasHandle_t handle;
+    hipStream_t     stream;
+
+    CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(x), &handle));
+    CeedCallHipblas(CeedVectorReturnCeed(x), hipblasGetStream(handle, &stream));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallHipblas(CeedVectorReturnCeed(x), hipblasSscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
+#else  /* CEED_SCALAR */
+    CeedCallHipblas(CeedVectorReturnCeed(x), hipblasDscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1));
+#endif /* CEED_SCALAR */
+    CeedCallHip(CeedVectorReturnCeed(x), hipStreamSynchronize(stream));
+#else  /* HIP_VERSION */
+    CeedCallBackend(CeedDeviceScale_Hip(impl->d_array, alpha, length));
+#endif /* HIP_VERSION */
+    impl->h_array = NULL;
+  }
+  if (impl->h_array) {
+    CeedCallBackend(CeedHostScale_Hip(impl->h_array, alpha, length));
+    impl->d_array = NULL;
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -562,7 +830,7 @@ static int CeedHostAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x
 }
 
 //------------------------------------------------------------------------------
-// Compute y = alpha x + y on device (impl in .cu file)
+// Compute y = alpha x + y on device (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length);
 
@@ -579,11 +847,26 @@ static int CeedVectorAXPY_Hip(CeedVector y, CeedScalar alpha, CeedVector x) {
   // Set value for synced device/host array
   if (y_impl->d_array) {
     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
+#if (HIP_VERSION >= 60000000)
+    hipblasHandle_t handle;
+    hipStream_t     stream;
+
+    CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(x), &handle));
+    CeedCallHipblas(CeedVectorReturnCeed(y), hipblasGetStream(handle, &stream));
+#if defined(CEED_SCALAR_IS_FP32)
+    CeedCallHipblas(CeedVectorReturnCeed(y), hipblasSaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
+#else  /* CEED_SCALAR */
+    CeedCallHipblas(CeedVectorReturnCeed(y), hipblasDaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1));
+#endif /* CEED_SCALAR */
+    CeedCallHip(CeedVectorReturnCeed(y), hipStreamSynchronize(stream));
+#else  /* HIP_VERSION */
     CeedCallBackend(CeedDeviceAXPY_Hip(y_impl->d_array, alpha, x_impl->d_array, length));
-  }
-  if (y_impl->h_array) {
+#endif /* HIP_VERSION */
+    y_impl->h_array = NULL;
+  } else if (y_impl->h_array) {
     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
     CeedCallBackend(CeedHostAXPY_Hip(y_impl->h_array, alpha, x_impl->h_array, length));
+    y_impl->d_array = NULL;
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -597,7 +880,7 @@ static int CeedHostAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar b
 }
 
 //------------------------------------------------------------------------------
-// Compute y = alpha x + beta y on device (impl in .cu file)
+// Compute y = alpha x + beta y on device (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length);
 
@@ -632,7 +915,7 @@ static int CeedHostPointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, C
 }
 
 //------------------------------------------------------------------------------
-// Compute the pointwise multiplication w = x .* y on device (impl in .cu file)
+// Compute the pointwise multiplication w = x .* y on device (impl in .hip.cpp file)
 //------------------------------------------------------------------------------
 int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length);
 
@@ -683,6 +966,7 @@ static int CeedVectorDestroy_Hip(const CeedVector vec) {
 //------------------------------------------------------------------------------
 int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) {
   CeedVector_Hip *impl;
+  Ceed_Hip       *hip_impl;
   Ceed            ceed;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
@@ -690,19 +974,24 @@ int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "CopyStrided", CeedVectorCopyStrided_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", CeedVectorSetValue_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", CeedVectorSetValueStrided_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())CeedVectorScale_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())CeedVectorAXPY_Hip));
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", CeedVectorScale_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", CeedVectorAXPY_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Hip));
   CeedCallBackend(CeedCalloc(1, &impl));
+  CeedCallBackend(CeedGetData(ceed, &hip_impl));
+  CeedCallBackend(CeedDestroy(&ceed));
+  impl->has_unified_addressing = hip_impl->has_unified_addressing;
   CeedCallBackend(CeedVectorSetData(vec, impl));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-ref/ceed-hip-ref.c b/backends/hip-ref/ceed-hip-ref.c
index 8494d127e3..f22f3a16e7 100644
--- a/backends/hip-ref/ceed-hip-ref.c
+++ b/backends/hip-ref/ceed-hip-ref.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -29,7 +29,10 @@ int CeedGetHipblasHandle_Hip(Ceed ceed, hipblasHandle_t *handle) {
   Ceed_Hip *data;
 
   CeedCallBackend(CeedGetData(ceed, &data));
-  if (!data->hipblas_handle) CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle));
+  if (!data->hipblas_handle) {
+    CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle));
+    CeedCallHipblas(ceed, hipblasSetPointerMode(data->hipblas_handle, HIPBLAS_POINTER_MODE_HOST));
+  }
   *handle = data->hipblas_handle;
   return CEED_ERROR_SUCCESS;
 }
@@ -57,9 +60,11 @@ static int CeedInit_Hip_ref(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreateAtPoints", CeedElemRestrictionCreate_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreateAtPoints_Hip));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 815790c53c..2e7ee88313 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -17,6 +17,7 @@
 #endif
 
 typedef struct {
+  int         has_unified_addressing;
   CeedScalar *h_array;
   CeedScalar *h_array_borrowed;
   CeedScalar *h_array_owned;
@@ -52,6 +53,18 @@ typedef struct {
   const CeedInt8 *d_curl_orients;
   const CeedInt8 *d_curl_orients_borrowed;
   const CeedInt8 *d_curl_orients_owned;
+  const CeedInt  *h_offsets_at_points;
+  const CeedInt  *h_offsets_at_points_borrowed;
+  const CeedInt  *h_offsets_at_points_owned;
+  const CeedInt  *d_offsets_at_points;
+  const CeedInt  *d_offsets_at_points_borrowed;
+  const CeedInt  *d_offsets_at_points_owned;
+  const CeedInt  *h_points_per_elem;
+  const CeedInt  *h_points_per_elem_borrowed;
+  const CeedInt  *h_points_per_elem_owned;
+  const CeedInt  *d_points_per_elem;
+  const CeedInt  *d_points_per_elem_borrowed;
+  const CeedInt  *d_points_per_elem_owned;
 } CeedElemRestriction_Hip;
 
 typedef struct {
@@ -59,9 +72,19 @@ typedef struct {
   hipFunction_t Interp;
   hipFunction_t Grad;
   hipFunction_t Weight;
+  hipModule_t   moduleAtPoints;
+  CeedInt       num_points;
+  hipFunction_t InterpAtPoints;
+  hipFunction_t InterpTransposeAtPoints;
+  hipFunction_t GradAtPoints;
+  hipFunction_t GradTransposeAtPoints;
   CeedScalar   *d_interp_1d;
   CeedScalar   *d_grad_1d;
   CeedScalar   *d_q_weight_1d;
+  CeedScalar   *d_chebyshev_interp_1d;
+  CeedInt       num_elem_at_points;
+  CeedInt      *h_points_per_elem;
+  CeedInt      *d_points_per_elem;
 } CeedBasis_Hip;
 
 typedef struct {
@@ -81,7 +104,6 @@ typedef struct {
 typedef struct {
   hipModule_t   module;
   const char   *qfunction_name;
-  const char   *qfunction_source;
   hipFunction_t QFunction;
   Fields_Hip    fields;
   void         *d_c;
@@ -115,12 +137,17 @@ typedef struct {
 } CeedOperatorAssemble_Hip;
 
 typedef struct {
-  CeedVector               *e_vecs;      // E-vectors, inputs followed by outputs
-  CeedVector               *q_vecs_in;   // Input Q-vectors needed to apply operator
-  CeedVector               *q_vecs_out;  // Output Q-vectors needed to apply operator
+  bool                     *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
+  uint64_t                 *input_states, points_state;  // State tracking for passive inputs
+  CeedVector               *e_vecs_in, *e_vecs_out;
+  CeedVector               *q_vecs_in, *q_vecs_out;
   CeedInt                   num_inputs, num_outputs;
   CeedInt                   num_active_in, num_active_out;
-  CeedVector               *qf_active_in;
+  CeedInt                  *input_field_order, *output_field_order;
+  CeedSize                  max_active_e_vec_len;
+  CeedInt                   max_num_points;
+  CeedInt                  *num_points;
+  CeedVector               *qf_active_in, point_coords_elem;
   CeedOperatorDiag_Hip     *diag;
   CeedOperatorAssemble_Hip *asmb;
 } CeedOperator_Hip;
@@ -146,3 +173,4 @@ CEED_INTERN int CeedQFunctionCreate_Hip(CeedQFunction qf);
 CEED_INTERN int CeedQFunctionContextCreate_Hip(CeedQFunctionContext ctx);
 
 CEED_INTERN int CeedOperatorCreate_Hip(CeedOperator op);
+CEED_INTERN int CeedOperatorCreateAtPoints_Hip(CeedOperator op);
diff --git a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
index 5f6dd15f2a..b9f81032b5 100644
--- a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
+++ b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,14 +8,37 @@
 #include <ceed.h>
 #include <hip/hip_runtime.h>
 
+//------------------------------------------------------------------------------
+// Kernel for copy strided on device
+//------------------------------------------------------------------------------
+__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < size) {
+    if ((index - start) % step == 0) vec_copy[index] = vec[index];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Copy strided on device memory
+//------------------------------------------------------------------------------
+extern "C" int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) {
+  const int      block_size = 512;
+  const CeedSize vec_size   = length;
+  int            grid_size  = vec_size / block_size;
+
+  if (block_size * grid_size < vec_size) grid_size += 1;
+  hipLaunchKernelGGL(copyStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, d_copy_array);
+  return 0;
+}
+
 //------------------------------------------------------------------------------
 // Kernel for set value on device
 //------------------------------------------------------------------------------
 __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  vec[index] = val;
+  if (index < size) vec[index] = val;
 }
 
 //------------------------------------------------------------------------------
@@ -31,14 +54,39 @@ extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, Ceed
   return 0;
 }
 
+//------------------------------------------------------------------------------
+// Kernel for set value strided on device
+//------------------------------------------------------------------------------
+__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+
+  if (index < stop - start) {
+    if (index % step == 0) vec[start + index] = val;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Set value strided on device memory
+//------------------------------------------------------------------------------
+extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedInt stop, CeedSize step, CeedSize length, CeedScalar val) {
+  const int      block_size = 512;
+  const CeedSize set_size   = stop - start;
+  int            grid_size  = set_size / block_size;
+
+  if (block_size * grid_size < set_size) grid_size += 1;
+  hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, stop, step, val);
+  return 0;
+}
+
 //------------------------------------------------------------------------------
 // Kernel for taking reciprocal
 //------------------------------------------------------------------------------
 __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
+  if (index < size) {
+    if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -58,10 +106,9 @@ extern "C" int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length) {
 // Kernel for scale
 //------------------------------------------------------------------------------
 __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  x[index] *= alpha;
+  if (index < size) x[index] *= alpha;
 }
 
 //------------------------------------------------------------------------------
@@ -81,10 +128,9 @@ extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSi
 // Kernel for axpy
 //------------------------------------------------------------------------------
 __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  y[index] += alpha * x[index];
+  if (index < size) y[index] += alpha * x[index];
 }
 
 //------------------------------------------------------------------------------
@@ -104,11 +150,12 @@ extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedSca
 // Kernel for axpby
 //------------------------------------------------------------------------------
 __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  y[index] = beta * y[index];
-  y[index] += alpha * x[index];
+  if (index < size) {
+    y[index] = beta * y[index];
+    y[index] += alpha * x[index];
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -128,10 +175,9 @@ extern "C" int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedSc
 // Kernel for pointwise mult
 //------------------------------------------------------------------------------
 __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) {
-  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
+  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
 
-  if (index >= size) return;
-  w[index] = x[index] * y[index];
+  if (index < size) w[index] = x[index] * y[index];
 }
 
 //------------------------------------------------------------------------------
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 7642043d4c..3fb4c93630 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -10,6 +10,7 @@
 #include <ceed/jit-tools.h>
 #include <stdbool.h>
 #include <stddef.h>
+#include <string.h>
 #include <hip/hip_runtime.h>
 
 #include "../hip/ceed-hip-common.h"
@@ -87,8 +88,8 @@ static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, c
 //------------------------------------------------------------------------------
 // Apply basis
 //------------------------------------------------------------------------------
-int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
-                                    CeedVector v) {
+static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
+                                               CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
   Ceed                  ceed;
   Ceed_Hip             *ceed_Hip;
   CeedInt               dim, num_comp;
@@ -105,7 +106,11 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
   // Get read/write access to u, v
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
 
   // Apply basis operation
   switch (eval_mode) {
@@ -113,6 +118,7 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
       CeedInt P_1d, Q_1d;
       CeedInt block_size = data->block_sizes[0];
 
+      CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       CeedInt thread_1d     = CeedIntMax(Q_1d, P_1d);
@@ -121,36 +127,37 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
       if (dim == 1) {
         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1,
+                                                     elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
         }
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
+                                                     thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
       } else if (dim == 3) {
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(
-              CeedRunKernelDimShared_Hip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,
+                                                     thread_1d, elems_per_block, shared_mem, interp_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
         }
       }
     } break;
@@ -158,6 +165,7 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
       CeedInt P_1d, Q_1d;
       CeedInt block_size = data->block_sizes[1];
 
+      CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       CeedInt     thread_1d = CeedIntMax(Q_1d, P_1d);
@@ -167,37 +175,41 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
         d_grad_1d = data->d_collo_grad_1d;
       }
       void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};
+
       if (dim == 1) {
         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
-        CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 1,
+                                                     elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 2) {
         // Check if required threads is small enough to do multiple elems
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, thread_1d,
+                                                     elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       } else if (dim == 3) {
         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
-        CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
 
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, thread_1d,
+                                                     elems_per_block, shared_mem, grad_args));
         } else {
-          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
         }
       }
     } break;
@@ -205,25 +217,26 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
       CeedInt Q_1d;
       CeedInt block_size = data->block_sizes[2];
 
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
 
       if (dim == 1) {
         const CeedInt opt_elems       = block_size / Q_1d;
         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args));
       } else if (dim == 2) {
         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
       } else if (dim == 3) {
         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
-        const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
+        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
 
         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
       }
@@ -241,6 +254,375 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                    CeedVector v) {
+  CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                       CeedVector v) {
+  CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Basis apply - tensor AtPoints
+//------------------------------------------------------------------------------
+static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
+                                                 CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  Ceed                  ceed;
+  CeedInt               Q_1d, dim, max_num_points = num_points[0];
+  const CeedInt         is_transpose = t_mode == CEED_TRANSPOSE;
+  const CeedScalar     *d_x, *d_u;
+  CeedScalar           *d_v;
+  CeedBasis_Hip_shared *data;
+
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+
+  // Weight handled separately
+  if (eval_mode == CEED_EVAL_WEIGHT) {
+    CeedCallBackend(CeedVectorSetValue(v, 1.0));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
+      CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
+    }
+  }
+
+  // Build kernels if needed
+  if (data->num_points != max_num_points) {
+    CeedInt P_1d;
+
+    CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+    data->num_points = max_num_points;
+
+    // -- Create interp matrix to Chebyshev coefficients
+    if (!data->d_chebyshev_interp_1d) {
+      CeedSize    interp_bytes;
+      CeedScalar *chebyshev_interp_1d;
+
+      interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+      CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+      CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+      CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
+      CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
+      CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+    }
+
+    // -- Compile kernels
+    const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h>\n";
+    CeedInt    num_comp;
+
+    if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
+                                    CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
+                                    "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "BASIS_INTERP_BLOCK_SIZE",
+                                    data->block_sizes[0]));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAddAtPoints", &data->InterpTransposeAddAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
+    CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAddAtPoints", &data->GradTransposeAddAtPoints));
+  }
+
+  // Get read/write access to u, v
+  CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
+
+  // Basis action
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      CeedInt P_1d, Q_1d;
+      CeedInt block_size = data->block_sizes[0];
+
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      CeedInt thread_1d     = CeedIntMax(Q_1d, P_1d);
+      void   *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+
+      if (dim == 1) {
+        CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
+        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
+                                                     thread_1d, 1, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
+        }
+      } else if (dim == 2) {
+        // Check if required threads is small enough to do multiple elems
+        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
+                                                     thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                     interp_args));
+        }
+      } else if (dim == 3) {
+        const CeedInt elems_per_block = 1;
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid,
+                                                     thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                     interp_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_GRAD: {
+      CeedInt P_1d, Q_1d;
+      CeedInt block_size = data->block_sizes[0];
+
+      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+      CeedInt thread_1d   = CeedIntMax(Q_1d, P_1d);
+      void   *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+
+      if (dim == 1) {
+        CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
+        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                     thread_1d, 1, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
+        }
+      } else if (dim == 2) {
+        // Check if required threads is small enough to do multiple elems
+        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                     thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                     grad_args));
+        }
+      } else if (dim == 3) {
+        const CeedInt elems_per_block = 1;
+        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
+
+        if (is_transpose) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid,
+                                                     thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,
+                                                     grad_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_WEIGHT:
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                             CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                                CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Apply basis
+//------------------------------------------------------------------------------
+static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
+                                                  CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+  Ceed                  ceed;
+  Ceed_Hip             *ceed_Hip;
+  CeedInt               dim, num_comp;
+  const CeedScalar     *d_u;
+  CeedScalar           *d_v;
+  CeedBasis_Hip_shared *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
+  CeedCallBackend(CeedBasisGetData(basis, &data));
+  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+
+  // Get read/write access to u, v
+  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
+  if (apply_add) {
+    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  } else {
+    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  }
+
+  // Apply basis operation
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      CeedInt P, Q;
+
+      CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread        = CeedIntMax(Q, P);
+      void   *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
+
+      {
+        CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
+        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
+
+        if (t_mode == CEED_TRANSPOSE) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1,
+                                                     elems_per_block, shared_mem, interp_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, interp_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_GRAD: {
+      CeedInt P, Q;
+
+      CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread      = CeedIntMax(Q, P);
+      void   *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v};
+
+      {
+        CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
+        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
+        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+        CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);
+
+        if (t_mode == CEED_TRANSPOSE) {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1,
+                                                     elems_per_block, shared_mem, grad_args));
+        } else {
+          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, grad_args));
+        }
+      }
+    } break;
+    case CEED_EVAL_WEIGHT: {
+      CeedInt P, Q;
+
+      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
+      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
+      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
+      CeedInt thread        = CeedIntMax(Q, P);
+      void   *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
+
+      {
+        CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
+        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
+        const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
+
+        CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, thread, elems_per_block, 1, weight_args));
+      }
+    } break;
+    case CEED_EVAL_NONE: /* handled separately below */
+      break;
+    // LCOV_EXCL_START
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+
+  // Restore vectors, cover CEED_EVAL_NONE
+  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
+  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedBasisApplyNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                       CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedBasisApplyAddNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
+                                          CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -254,11 +636,16 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &data));
   CeedCallHip(ceed, hipModuleUnload(data->module));
-  CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
+  if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
+  if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
   CeedCallHip(ceed, hipFree(data->d_interp_1d));
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
   CeedCallHip(ceed, hipFree(data->d_collo_grad_1d));
+  CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
   CeedCallBackend(CeedFree(&data));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -268,8 +655,6 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
 int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                        const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
   Ceed                  ceed;
-  char                 *basis_kernel_source;
-  const char           *basis_kernel_path;
   CeedInt               num_comp;
   const CeedInt         q_bytes      = Q_1d * sizeof(CeedScalar);
   const CeedInt         interp_bytes = q_bytes * P_1d;
@@ -279,8 +664,10 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedCalloc(1, &data));
 
   // Copy basis data to GPU
-  CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
-  CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
+  if (q_weight_1d) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
+  }
   CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
   CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice));
   CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes));
@@ -305,28 +692,106 @@ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, data->block_sizes));
 
   // Compile basis kernels
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
-  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
+  bool       is_collocated         = false;
+  const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor.h>\n";
+
+  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D",
                                   CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                                   "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], "BASIS_GRAD_BLOCK_SIZE",
                                   data->block_sizes[1], "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2], "BASIS_HAS_COLLOCATED_GRAD",
                                   has_collocated_grad));
+  CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocated" : "Interp", &data->Interp));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocatedTranspose" : "InterpTranspose", &data->InterpTranspose));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "InterpCollocatedTransposeAdd" : "InterpTransposeAdd",
+                                    &data->InterpTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocated" : "Grad", &data->Grad));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocatedTranspose" : "GradTranspose", &data->GradTranspose));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, is_collocated ? "GradCollocatedTransposeAdd" : "GradTransposeAdd", &data->GradTransposeAdd));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
+
+  CeedCallBackend(CeedBasisSetData(basis, data));
+
+  // Register backend functions
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Create non-tensor basis
+//------------------------------------------------------------------------------
+int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                 const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
+  Ceed                  ceed;
+  CeedInt               num_comp, q_comp_interp, q_comp_grad;
+  const CeedInt         q_bytes = num_qpts * sizeof(CeedScalar);
+  CeedBasis_Hip_shared *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+
+  // Check shared memory size
+  {
+    Ceed_Hip *hip_data;
+
+    CeedCallBackend(CeedGetData(ceed, &hip_data));
+    if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) >
+        hip_data->device_prop.sharedMemPerBlock) {
+      CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis));
+      CeedCallBackend(CeedDestroy(&ceed));
+      return CEED_ERROR_SUCCESS;
+    }
+  }
+
+  CeedCallBackend(CeedCalloc(1, &data));
+
+  // Copy basis data to GPU
+  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
+  if (q_weight) {
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight, q_bytes, hipMemcpyHostToDevice));
+  }
+  if (interp) {
+    const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
+
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp, interp_bytes, hipMemcpyHostToDevice));
+  }
+  if (grad) {
+    const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;
+
+    CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, grad_bytes));
+    CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad, grad_bytes, hipMemcpyHostToDevice));
+  }
+
+  // Compile basis kernels
+  const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-nontensor.h>\n";
+
+  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCallBackend(ComputeBasisThreadBlockSizes(dim, num_nodes, num_qpts, num_comp, data->block_sizes));
+  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 6, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_T_1D",
+                                  CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_INTERP_BLOCK_SIZE",
+                                  data->block_sizes[0]));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose));
+  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
-  CeedCallBackend(CeedFree(&basis_kernel_path));
-  CeedCallBackend(CeedFree(&basis_kernel_source));
 
   CeedCallBackend(CeedBasisSetData(basis, data));
 
   // Register backend functions
-  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip-shared/ceed-hip-shared.c b/backends/hip-shared/ceed-hip-shared.c
index f69c78d4ee..afb39e8bde 100644
--- a/backends/hip-shared/ceed-hip-shared.c
+++ b/backends/hip-shared/ceed-hip-shared.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -33,8 +33,10 @@ static int CeedInit_Hip_shared(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedInit("/gpu/hip/ref", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Hip_shared));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Hip_shared));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h
index 6a7c99d048..c534b85e33 100644
--- a/backends/hip-shared/ceed-hip-shared.h
+++ b/backends/hip-shared/ceed-hip-shared.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -14,15 +14,32 @@ typedef struct {
   hipModule_t   module;
   hipFunction_t Interp;
   hipFunction_t InterpTranspose;
+  hipFunction_t InterpTransposeAdd;
   hipFunction_t Grad;
   hipFunction_t GradTranspose;
+  hipFunction_t GradTransposeAdd;
   hipFunction_t Weight;
+  hipModule_t   moduleAtPoints;
+  CeedInt       num_points;
+  hipFunction_t InterpAtPoints;
+  hipFunction_t InterpTransposeAtPoints;
+  hipFunction_t InterpTransposeAddAtPoints;
+  hipFunction_t GradAtPoints;
+  hipFunction_t GradTransposeAtPoints;
+  hipFunction_t GradTransposeAddAtPoints;
   CeedInt       block_sizes[3];  // interp, grad, weight thread block sizes
   CeedScalar   *d_interp_1d;
   CeedScalar   *d_grad_1d;
   CeedScalar   *d_collo_grad_1d;
   CeedScalar   *d_q_weight_1d;
+  CeedScalar   *d_chebyshev_interp_1d;
+  CeedInt       num_elem_at_points;
+  CeedInt      *h_points_per_elem;
+  CeedInt      *d_points_per_elem;
 } CeedBasis_Hip_shared;
 
 CEED_INTERN int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                                    const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis);
+
+CEED_INTERN int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                             const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
diff --git a/backends/hip/ceed-hip-common.c b/backends/hip/ceed-hip-common.c
index 597aee9037..c33f13b766 100644
--- a/backends/hip/ceed-hip-common.c
+++ b/backends/hip/ceed-hip-common.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -19,7 +19,8 @@ int CeedInit_Hip(Ceed ceed, const char *resource) {
   Ceed_Hip   *data;
   const char *device_spec = strstr(resource, ":device_id=");
   const int   device_id   = (device_spec) ? atoi(device_spec + 11) : -1;
-  int         current_device_id;
+  int         current_device_id, xnack_value;
+  const char *xnack;
 
   CeedCallHip(ceed, hipGetDevice(&current_device_id));
   if (device_id >= 0 && current_device_id != device_id) {
@@ -30,6 +31,12 @@ int CeedInit_Hip(Ceed ceed, const char *resource) {
   CeedCallBackend(CeedGetData(ceed, &data));
   data->device_id = current_device_id;
   CeedCallHip(ceed, hipGetDeviceProperties(&data->device_prop, current_device_id));
+  xnack                        = getenv("HSA_XNACK");
+  xnack_value                  = !!xnack ? atol(xnack) : 0;
+  data->has_unified_addressing = xnack_value > 0 ? data->device_prop.unifiedAddressing : 0;
+  if (data->has_unified_addressing) {
+    CeedDebug(ceed, "Using unified memory addressing");
+  }
   data->opt_block_size = 256;
   return CEED_ERROR_SUCCESS;
 }
@@ -53,10 +60,15 @@ static inline int CeedSetDeviceGenericArray_Hip(Ceed ceed, const void *source_ar
                                                 void *target_array_owned, void *target_array_borrowed, void *target_array) {
   switch (copy_mode) {
     case CEED_COPY_VALUES:
-      if (!*(void **)target_array_owned) CeedCallHip(ceed, hipMalloc(target_array_owned, size_unit * num_values));
-      if (source_array) CeedCallHip(ceed, hipMemcpy(*(void **)target_array_owned, source_array, size_unit * num_values, hipMemcpyDeviceToDevice));
-      *(void **)target_array_borrowed = NULL;
-      *(void **)target_array          = *(void **)target_array_owned;
+      if (!*(void **)target_array) {
+        if (*(void **)target_array_borrowed) {
+          *(void **)target_array = *(void **)target_array_borrowed;
+        } else {
+          if (!*(void **)target_array_owned) CeedCallHip(ceed, hipMalloc(target_array_owned, size_unit * num_values));
+          *(void **)target_array = *(void **)target_array_owned;
+        }
+      }
+      if (source_array) CeedCallHip(ceed, hipMemcpy(*(void **)target_array, source_array, size_unit * num_values, hipMemcpyDeviceToDevice));
       break;
     case CEED_OWN_POINTER:
       CeedCallHip(ceed, hipFree(*(void **)target_array_owned));
diff --git a/backends/hip/ceed-hip-common.h b/backends/hip/ceed-hip-common.h
index c62c392abe..fb89216be5 100644
--- a/backends/hip/ceed-hip-common.h
+++ b/backends/hip/ceed-hip-common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -72,6 +72,7 @@ typedef struct {
   hipblasHandle_t        hipblas_handle;
   struct hipDeviceProp_t device_prop;
   int                    opt_block_size;
+  int                    has_unified_addressing;
 } Ceed_Hip;
 
 CEED_INTERN int CeedInit_Hip(Ceed ceed, const char *resource);
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
index cafb79ed7f..e30bc07a02 100644
--- a/backends/hip/ceed-hip-compile.cpp
+++ b/backends/hip/ceed-hip-compile.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -33,12 +33,13 @@
 //------------------------------------------------------------------------------
 // Compile HIP kernel
 //------------------------------------------------------------------------------
-int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) {
+static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, hipModule_t *module,
+                               const CeedInt num_defines, va_list args) {
   size_t                 ptx_size;
-  char                  *jit_defs_source, *ptx;
-  const char            *jit_defs_path;
-  const int              num_opts = 3;
-  const char            *opts[num_opts];
+  char                  *ptx;
+  const int              num_opts            = 4;
+  CeedInt                num_jit_source_dirs = 0, num_jit_defines = 0;
+  const char           **opts;
   int                    runtime_version;
   hiprtcProgram          prog;
   struct hipDeviceProp_t prop;
@@ -62,8 +63,6 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
 
   // Kernel specific options, such as kernel constants
   if (num_defines > 0) {
-    va_list args;
-    va_start(args, num_defines);
     char *name;
     int   val;
 
@@ -72,24 +71,48 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
       val  = va_arg(args, int);
       code << "#define " << name << " " << val << "\n";
     }
-    va_end(args);
   }
 
   // Standard libCEED definitions for HIP backends
-  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-jit.h", &jit_defs_path));
-  CeedCallBackend(CeedLoadSourceToBuffer(ceed, jit_defs_path, &jit_defs_source));
-  code << jit_defs_source;
-  code << "\n\n";
-  CeedCallBackend(CeedFree(&jit_defs_path));
-  CeedCallBackend(CeedFree(&jit_defs_source));
+  code << "#include <ceed/jit-source/hip/hip-jit.h>\n\n";
 
   // Non-macro options
+  CeedCallBackend(CeedCalloc(num_opts, &opts));
   opts[0] = "-default-device";
   CeedCallBackend(CeedGetData(ceed, (void **)&ceed_data));
   CeedCallHip(ceed, hipGetDeviceProperties(&prop, ceed_data->device_id));
   std::string arch_arg = "--gpu-architecture=" + std::string(prop.gcnArchName);
   opts[1]              = arch_arg.c_str();
   opts[2]              = "-munsafe-fp-atomics";
+  opts[3]              = "-DCEED_RUNNING_JIT_PASS=1";
+  // Additional include dirs
+  {
+    const char **jit_source_dirs;
+
+    CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
+    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts));
+    for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
+      std::ostringstream include_dir_arg;
+
+      include_dir_arg << "-I" << jit_source_dirs[i];
+      CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i]));
+    }
+    CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
+  }
+  // User defines
+  {
+    const char **jit_defines;
+
+    CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
+    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts));
+    for (CeedInt i = 0; i < num_jit_defines; i++) {
+      std::ostringstream define_arg;
+
+      define_arg << "-D" << jit_defines[i];
+      CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i]));
+    }
+    CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines));
+  }
 
   // Add string source argument provided in call
   code << source;
@@ -98,19 +121,48 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   CeedCallHiprtc(ceed, hiprtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
 
   // Compile kernel
-  hiprtcResult result = hiprtcCompileProgram(prog, num_opts, opts);
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
+  CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n");
+  if (CeedDebugFlag(ceed)) {
+    // LCOV_EXCL_START
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n");
+    for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
+      CeedDebug(ceed, "Option %d: %s", i, opts[i]);
+    }
+    CeedDebug(ceed, "");
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n");
+    // LCOV_EXCL_STOP
+  }
+  hiprtcResult result = hiprtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
 
-  if (result != HIPRTC_SUCCESS) {
+  for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
+    CeedCallBackend(CeedFree(&opts[num_opts + i]));
+  }
+  for (CeedInt i = 0; i < num_jit_defines; i++) {
+    CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
+  }
+  CeedCallBackend(CeedFree(&opts));
+  *is_compile_good = result == HIPRTC_SUCCESS;
+  if (!*is_compile_good) {
     size_t log_size;
     char  *log;
 
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n");
-    CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- CEED JIT SOURCE FAILED TO COMPILE ----------\n");
     CeedChk_hiprtc(ceed, hiprtcGetProgramLogSize(prog, &log_size));
     CeedCallBackend(CeedMalloc(log_size, &log));
     CeedCallHiprtc(ceed, hiprtcGetProgramLog(prog, log));
-    return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", hiprtcGetErrorString(result), log);
+    if (throw_error) {
+      return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", hiprtcGetErrorString(result), log);
+    } else {
+      // LCOV_EXCL_START
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
+      CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", hiprtcGetErrorString(result), log);
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
+      CeedCallBackend(CeedFree(&log));
+      CeedCallHiprtc(ceed, hiprtcDestroyProgram(&prog));
+      return CEED_ERROR_SUCCESS;
+      // LCOV_EXCL_STOP
+    }
   }
 
   CeedCallHiprtc(ceed, hiprtcGetCodeSize(prog, &ptx_size));
@@ -123,6 +175,29 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce
   return CEED_ERROR_SUCCESS;
 }
 
+int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) {
+  bool    is_compile_good = true;
+  va_list args;
+
+  va_start(args, num_defines);
+  const CeedInt ierr = CeedCompileCore_Hip(ceed, source, true, &is_compile_good, module, num_defines, args);
+
+  va_end(args);
+  CeedCallBackend(ierr);
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedTryCompile_Hip(Ceed ceed, const char *source, bool *is_compile_good, hipModule_t *module, const CeedInt num_defines, ...) {
+  va_list args;
+
+  va_start(args, num_defines);
+  const CeedInt ierr = CeedCompileCore_Hip(ceed, source, false, is_compile_good, module, num_defines, args);
+
+  va_end(args);
+  CeedCallBackend(ierr);
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Get HIP kernel
 //------------------------------------------------------------------------------
@@ -151,9 +226,43 @@ int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, c
 //------------------------------------------------------------------------------
 // Run HIP kernel for spatial dimension with shared memory
 //------------------------------------------------------------------------------
-int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y,
-                               const int block_size_z, const int shared_mem_size, void **args) {
-  CeedCallHip(ceed, hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL));
+static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x,
+                                          const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
+                                          bool *is_good_run, void **args) {
+  hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
+
+  if (result == hipSuccess) {
+    *is_good_run = true;
+  } else {
+    if (throw_error) {
+      CeedCallHip(ceed, result);
+    } else {
+      // LCOV_EXCL_START
+      const char *message = hipGetErrorName(result);
+
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
+      CeedDebug(ceed, "%s\n", message);
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
+      // LCOV_EXCL_STOP
+    }
+    *is_good_run = false;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x,
+                               const int block_size_y, const int block_size_z, const int shared_mem_size, void **args) {
+  bool is_good_run = true;
+
+  CeedCallBackend(CeedRunKernelDimSharedCore_Hip(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
+                                                 &is_good_run, args));
+  return CEED_ERROR_SUCCESS;
+}
+
+int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x,
+                                  const int block_size_y, const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
+  CeedCallBackend(CeedRunKernelDimSharedCore_Hip(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
+                                                 is_good_run, args));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/hip/ceed-hip-compile.h b/backends/hip/ceed-hip-compile.h
index d990924ec2..dd48fe4cd0 100644
--- a/backends/hip/ceed-hip-compile.h
+++ b/backends/hip/ceed-hip-compile.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -13,6 +13,7 @@
 static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; }
 
 CEED_INTERN int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...);
+CEED_INTERN int CeedTryCompile_Hip(Ceed ceed, const char *source, bool *is_compile_good, hipModule_t *module, const CeedInt num_defines, ...);
 
 CEED_INTERN int CeedGetKernel_Hip(Ceed ceed, hipModule_t module, const char *name, hipFunction_t *kernel);
 
@@ -21,5 +22,7 @@ CEED_INTERN int CeedRunKernel_Hip(Ceed ceed, hipFunction_t kernel, int grid_size
 CEED_INTERN int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
                                      void **args);
 
-CEED_INTERN int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
-                                           int shared_mem_size, void **args);
+CEED_INTERN int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, int grid_size, int block_size_x, int block_size_y,
+                                           int block_size_z, int shared_mem_size, void **args);
+CEED_INTERN int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, int grid_size, int block_size_x, int block_size_y,
+                                              int block_size_z, int shared_mem_size, bool *is_good_run, void **args);
diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index 3d00a64100..6ce6ce33b9 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -26,7 +26,8 @@
 //------------------------------------------------------------------------------
 // Basis apply - tensor
 //------------------------------------------------------------------------------
-static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) {
+static int CeedBasisApplyCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
+                                    CeedVector v) {
   Ceed              ceed;
   Ceed_Magma       *data;
   CeedInt           dim, num_comp, num_nodes, P_1d, Q_1d, P, Q;
@@ -52,7 +53,8 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
   // Read vectors
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Apply basis operation
   switch (e_mode) {
@@ -115,9 +117,10 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
       void   *args[] = {&impl->d_interp_1d, &d_u, &u_elem_stride, &u_comp_stride, &d_v, &v_elem_stride, &v_comp_stride, &num_elem};
 
       if (t_mode == CEED_TRANSPOSE) {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->InterpTransposeAdd : impl->InterpTranspose, NULL, grid, num_threads,
+                                                    num_t_col, 1, shared_mem, args));
       } else {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp, NULL, grid, num_threads, num_t_col, 1, shared_mem, args));
       }
     } break;
     case CEED_EVAL_GRAD: {
@@ -192,13 +195,15 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
                         &v_elem_stride,     &v_comp_stride,   &v_dim_stride, &num_elem};
 
       if (t_mode == CEED_TRANSPOSE) {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, apply_add ? impl->GradTransposeAdd : impl->GradTranspose, NULL, grid, num_threads,
+                                                    num_t_col, 1, shared_mem, args));
       } else {
-        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, grid, num_threads, num_t_col, 1, shared_mem, args));
+        CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad, NULL, grid, num_threads, num_t_col, 1, shared_mem, args));
       }
     } break;
     case CEED_EVAL_WEIGHT: {
       CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
+      CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[e_mode]);
       CeedInt elem_dofs_size = CeedIntPow(Q, dim);
       CeedInt num_threads    = 1;
       CeedInt num_t_col      = 1;
@@ -225,7 +230,7 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
       CeedInt grid   = CeedDivUpInt(num_elem, num_t_col);
       void   *args[] = {&impl->d_q_weight_1d, &d_v, &elem_dofs_size, &num_elem};
 
-      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, num_threads, num_t_col, 1, shared_mem, args));
+      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, NULL, grid, num_threads, num_t_col, 1, shared_mem, args));
     } break;
     // LCOV_EXCL_START
     case CEED_EVAL_DIV:
@@ -244,14 +249,33 @@ static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTranspose
     CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   }
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Magma(basis, false, num_elem, t_mode, e_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAdd_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u, CeedVector v) {
+  CeedCallBackend(CeedBasisApplyCore_Magma(basis, true, num_elem, t_mode, e_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Basis apply - tensor AtPoints
+//------------------------------------------------------------------------------
+int CeedBasisApplyAtPoints_Magma(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                 CeedVector x_ref, CeedVector u, CeedVector v) {
+  return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "Backend does not implement CeedBasisApplyAtPoints");
+}
+
 //------------------------------------------------------------------------------
 // Basis apply - non-tensor
 //------------------------------------------------------------------------------
-static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
-                                         CeedVector v) {
+static int CeedBasisApplyNonTensorCore_Magma(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode,
+                                             CeedVector u, CeedVector v) {
   Ceed                      ceed;
   Ceed_Magma               *data;
   CeedInt                   num_comp, num_nodes, num_qpts, P, Q, N;
@@ -272,7 +296,8 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
   // Read vectors
   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
   else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
 
   // Compile kernels for N as needed
   CeedInt iN = 0;
@@ -335,16 +360,19 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
                                        impl->NB_deriv_t[iN]));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_n", &impl->Interp[iN]));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN]));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_ta", &impl->InterpTransposeAdd[iN]));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN]));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN]));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_ta", &impl->DerivTransposeAdd[iN]));
       if (!impl->Weight) {
         CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_weight_nontensor", &impl->Weight));
         CeedCallBackend(CeedFree(&weight_kernel_path));
       }
       CeedCallBackend(CeedFree(&basis_kernel_path));
       CeedCallBackend(CeedFree(&basis_kernel_source));
-      for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-      CeedCall(CeedFree(&file_paths));
+      for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
+      CeedCallBackend(CeedFree(&file_paths));
+      CeedCallBackend(CeedDestroy(&ceed_delegate));
     }
   }
 
@@ -379,7 +407,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
     if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
       if (e_mode == CEED_EVAL_INTERP) {
         if (t_mode == CEED_TRANSPOSE) {
-          Kernel = impl->InterpTranspose[iN];
+          Kernel = apply_add ? impl->InterpTransposeAdd[iN] : impl->InterpTranspose[iN];
           NB     = impl->NB_interp_t[iN];
         } else {
           Kernel = impl->Interp[iN];
@@ -387,7 +415,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
         }
       } else {
         if (t_mode == CEED_TRANSPOSE) {
-          Kernel = impl->DerivTranspose[iN];
+          Kernel = apply_add ? impl->DerivTransposeAdd[iN] : impl->DerivTranspose[iN];
           NB     = impl->NB_deriv_t[iN];
         } else {
           Kernel = impl->Deriv[iN];
@@ -401,11 +429,11 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
       CeedInt shared_mem   = (t_mode != CEED_TRANSPOSE && q_comp > 1) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
       void   *args[]       = {&N, &d_b, &d_u, &d_v};
 
-      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, grid, M, num_t_col, 1, shared_mem, args));
+      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, NULL, grid, M, num_t_col, 1, shared_mem, args));
     } else {
       for (CeedInt d = 0; d < q_comp; d++) {
         if (t_mode == CEED_TRANSPOSE) {
-          const CeedScalar beta = (d > 0) ? 1.0 : 0.0;
+          const CeedScalar beta = (apply_add || (d > 0)) ? 1.0 : 0.0;
           magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, d_b + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P, data->queue);
         } else {
           magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, d_b + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue);
@@ -414,12 +442,13 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
     }
   } else {
     CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
+    CeedCheck(impl->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight not set", CeedEvalModes[e_mode]);
     CeedInt num_t_col  = MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D);
     CeedInt grid       = CeedDivUpInt(num_elem, num_t_col);
     CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar);
     void   *args[]     = {&num_elem, &impl->d_q_weight, &d_v};
 
-    CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, Q, num_t_col, 1, shared_mem, args));
+    CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, NULL, grid, Q, num_t_col, 1, shared_mem, args));
   }
 
   // Must sync to ensure completeness
@@ -430,6 +459,19 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
     CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   }
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  CeedCallBackend(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
+                                         CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, false, num_elem, t_mode, e_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAddNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector u,
+                                            CeedVector v) {
+  CeedCallBackend(CeedBasisApplyNonTensorCore_Magma(basis, true, num_elem, t_mode, e_mode, u, v));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -449,8 +491,9 @@ static int CeedBasisDestroy_Magma(CeedBasis basis) {
 #endif
   CeedCallBackend(magma_free(impl->d_interp_1d));
   CeedCallBackend(magma_free(impl->d_grad_1d));
-  CeedCallBackend(magma_free(impl->d_q_weight_1d));
+  if (impl->d_q_weight_1d) CeedCallBackend(magma_free(impl->d_q_weight_1d));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -476,8 +519,9 @@ static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
   CeedCallBackend(magma_free(impl->d_grad));
   CeedCallBackend(magma_free(impl->d_div));
   CeedCallBackend(magma_free(impl->d_curl));
-  CeedCallBackend(magma_free(impl->d_q_weight));
+  if (impl->d_q_weight) CeedCallBackend(magma_free(impl->d_q_weight));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -500,8 +544,10 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallBackend(CeedCalloc(1, &impl));
 
   // Copy basis data to GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0])));
-  magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue);
+  if (q_weight_1d) {
+    CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0])));
+    magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue);
+  }
   CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0])));
   magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue);
   CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * P_1d * sizeof(grad_1d[0])));
@@ -547,22 +593,28 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
     case 1:
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->Interp));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_1d_kernel", &impl->InterpTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->Grad));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_1d_kernel", &impl->GradTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->Weight));
       break;
     case 2:
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->Interp));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_2d_kernel", &impl->InterpTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->Grad));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_2d_kernel", &impl->GradTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->Weight));
       break;
     case 3:
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->Interp));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->InterpTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpta_3d_kernel", &impl->InterpTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->Grad));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", &impl->GradTranspose));
+      CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradta_3d_kernel", &impl->GradTransposeAdd));
       CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->Weight));
       break;
   }
@@ -570,13 +622,17 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   CeedCallBackend(CeedFree(&grad_kernel_path));
   CeedCallBackend(CeedFree(&weight_kernel_path));
   CeedCallBackend(CeedFree(&basis_kernel_source));
-  for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i]));
-  CeedCall(CeedFree(&file_paths));
+  for (CeedInt i = 0; i < num_file_paths; i++) CeedCallBackend(CeedFree(&file_paths[i]));
+  CeedCallBackend(CeedFree(&file_paths));
 
   CeedCallBackend(CeedBasisSetData(basis, impl));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_delegate));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -594,8 +650,10 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   CeedCallBackend(CeedCalloc(1, &impl));
 
   // Copy basis data to GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
-  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  if (q_weight) {
+    CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+    magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  }
   if (interp) {
     CeedInt q_comp_interp;
 
@@ -629,13 +687,16 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node
     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));
     CeedCallBackend(CeedFree(&weight_kernel_path));
     CeedCallBackend(CeedFree(&basis_kernel_source));
+    CeedCallBackend(CeedDestroy(&ceed_delegate));
   }
 
   CeedCallBackend(CeedBasisSetData(basis, impl));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -653,8 +714,10 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no
   CeedCallBackend(CeedCalloc(1, &impl));
 
   // Copy basis data to GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
-  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  if (q_weight) {
+    CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+    magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  }
   if (interp) {
     CeedInt q_comp_interp;
 
@@ -688,13 +751,16 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no
     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));
     CeedCallBackend(CeedFree(&weight_kernel_path));
     CeedCallBackend(CeedFree(&basis_kernel_source));
+    CeedCallBackend(CeedDestroy(&ceed_delegate));
   }
 
   CeedCallBackend(CeedBasisSetData(basis, impl));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -712,8 +778,10 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n
   CeedCallBackend(CeedCalloc(1, &impl));
 
   // Copy basis data to GPU
-  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
-  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  if (q_weight) {
+    CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+    magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  }
   if (interp) {
     CeedInt q_comp_interp;
 
@@ -747,13 +815,16 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n
     CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight));
     CeedCallBackend(CeedFree(&weight_kernel_path));
     CeedCallBackend(CeedFree(&basis_kernel_source));
+    CeedCallBackend(CeedDestroy(&ceed_delegate));
   }
 
   CeedCallBackend(CeedBasisSetData(basis, impl));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/magma/ceed-magma-common.c b/backends/magma/ceed-magma-common.c
index 592f216c6f..8e62e36b9c 100644
--- a/backends/magma/ceed-magma-common.c
+++ b/backends/magma/ceed-magma-common.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-common.h b/backends/magma/ceed-magma-common.h
index 5ebf9b0d10..83c313390e 100644
--- a/backends/magma/ceed-magma-common.h
+++ b/backends/magma/ceed-magma-common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-det.c b/backends/magma/ceed-magma-det.c
index 9b7125ccda..081cb6e7d9 100644
--- a/backends/magma/ceed-magma-det.c
+++ b/backends/magma/ceed-magma-det.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -35,6 +35,7 @@ static int CeedInit_Magma_Det(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedInit("/gpu/cuda/magma", &ceed_ref));
 #endif
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Magma));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/magma/ceed-magma-gemm-nontensor.cpp b/backends/magma/ceed-magma-gemm-nontensor.cpp
index 856b514acb..c43ff9266a 100644
--- a/backends/magma/ceed-magma-gemm-nontensor.cpp
+++ b/backends/magma/ceed-magma-gemm-nontensor.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-gemm-nontensor.h b/backends/magma/ceed-magma-gemm-nontensor.h
index 0431620b83..f7108b07c4 100644
--- a/backends/magma/ceed-magma-gemm-nontensor.h
+++ b/backends/magma/ceed-magma-gemm-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-gemm-selector.cpp b/backends/magma/ceed-magma-gemm-selector.cpp
index 46f963bca0..193c5ba4f5 100644
--- a/backends/magma/ceed-magma-gemm-selector.cpp
+++ b/backends/magma/ceed-magma-gemm-selector.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma-gemm-selector.h b/backends/magma/ceed-magma-gemm-selector.h
index c96c95c169..c199ef7dc2 100644
--- a/backends/magma/ceed-magma-gemm-selector.h
+++ b/backends/magma/ceed-magma-gemm-selector.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/ceed-magma.c b/backends/magma/ceed-magma.c
index 06254365b9..9908dd55da 100644
--- a/backends/magma/ceed-magma.c
+++ b/backends/magma/ceed-magma.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -36,6 +36,7 @@ static int CeedInit_Magma(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceed_ref));
 #endif
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Magma));
diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h
index aa60b37b40..c800f2a6ab 100644
--- a/backends/magma/ceed-magma.h
+++ b/backends/magma/ceed-magma.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -47,8 +47,10 @@ typedef struct {
   CeedMagmaModule   module;
   CeedMagmaFunction Interp;
   CeedMagmaFunction InterpTranspose;
+  CeedMagmaFunction InterpTransposeAdd;
   CeedMagmaFunction Grad;
   CeedMagmaFunction GradTranspose;
+  CeedMagmaFunction GradTransposeAdd;
   CeedMagmaFunction Weight;
   CeedScalar       *d_interp_1d;
   CeedScalar       *d_grad_1d;
@@ -59,8 +61,10 @@ typedef struct {
   CeedMagmaModule   module[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction InterpTransposeAdd[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction Deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction DerivTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction DerivTransposeAdd[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction Weight;
   CeedInt           NB_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedInt           NB_deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_deriv_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
diff --git a/backends/magma/tuning/Makefile b/backends/magma/tuning/Makefile
index 37cfa194f7..bde10abd6e 100644
--- a/backends/magma/tuning/Makefile
+++ b/backends/magma/tuning/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/tuning/generate_tuning.py b/backends/magma/tuning/generate_tuning.py
index 10a2062881..2e3180ba2f 100644
--- a/backends/magma/tuning/generate_tuning.py
+++ b/backends/magma/tuning/generate_tuning.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/magma/tuning/tuning.cpp b/backends/magma/tuning/tuning.cpp
index 7a387c14b6..37f20863ae 100644
--- a/backends/magma/tuning/tuning.cpp
+++ b/backends/magma/tuning/tuning.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/memcheck/ceed-memcheck-blocked.c b/backends/memcheck/ceed-memcheck-blocked.c
index 4d9f557af5..009c9e4601 100644
--- a/backends/memcheck/ceed-memcheck-blocked.c
+++ b/backends/memcheck/ceed-memcheck-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -22,6 +22,7 @@ static int CeedInit_Memcheck(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/ref/blocked", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Memcheck));
diff --git a/backends/memcheck/ceed-memcheck-qfunction.c b/backends/memcheck/ceed-memcheck-qfunction.c
index 7c66e3601a..17d823d4ab 100644
--- a/backends/memcheck/ceed-memcheck-qfunction.c
+++ b/backends/memcheck/ceed-memcheck-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -17,24 +17,32 @@
 // QFunction Apply
 //------------------------------------------------------------------------------
 static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) {
-  Ceed                    ceed;
   void                   *ctx_data = NULL;
+  int                     input_block_ids[CEED_FIELD_MAX], output_block_ids[CEED_FIELD_MAX];
   CeedInt                 num_in, num_out;
   CeedQFunctionUser       f = NULL;
   CeedQFunctionField     *output_fields;
   CeedQFunction_Memcheck *impl;
 
-  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
   CeedCallBackend(CeedQFunctionGetData(qf, &impl));
   CeedCallBackend(CeedQFunctionGetContextData(qf, CEED_MEM_HOST, &ctx_data));
   CeedCallBackend(CeedQFunctionGetUserFunction(qf, &f));
   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_in, &num_out));
-  int mem_block_ids[num_out];
 
-  // Get input/output arrays
+  // Get input arrays
   for (CeedInt i = 0; i < num_in; i++) {
+    CeedSize len;
+    char     name[32] = "";
+
     CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_HOST, &impl->inputs[i]));
+
+    CeedCallBackend(CeedVectorGetLength(U[i], &len));
+
+    snprintf(name, 32, "QFunction input %" CeedInt_FMT, i);
+    input_block_ids[i] = VALGRIND_CREATE_BLOCK(impl->inputs[i], len, name);
   }
+
+  // Get output arrays
   for (CeedInt i = 0; i < num_out; i++) {
     CeedSize len;
     char     name[32] = "";
@@ -44,8 +52,8 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *
     CeedCallBackend(CeedVectorGetLength(V[i], &len));
     VALGRIND_MAKE_MEM_UNDEFINED(impl->outputs[i], len);
 
-    snprintf(name, 32, "'QFunction output %" CeedInt_FMT "'", i);
-    mem_block_ids[i] = VALGRIND_CREATE_BLOCK(impl->outputs[i], len, name);
+    snprintf(name, 32, "QFunction output %" CeedInt_FMT, i);
+    output_block_ids[i] = VALGRIND_CREATE_BLOCK(impl->outputs[i], len, name);
   }
 
   // Call user function
@@ -54,8 +62,10 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *
   // Restore input arrays
   for (CeedInt i = 0; i < num_in; i++) {
     CeedCallBackend(CeedVectorRestoreArrayRead(U[i], &impl->inputs[i]));
+    VALGRIND_DISCARD(input_block_ids[i]);
   }
-  // Check for unset output values
+
+  // Check for unset output values and restore arrays
   {
     const char *kernel_name, *kernel_path;
 
@@ -63,17 +73,19 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *
     CeedCallBackend(CeedQFunctionGetKernelName(qf, &kernel_name));
     CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &output_fields));
     for (CeedInt i = 0; i < num_out; i++) {
-      CeedInt field_size;
+      const char *field_name;
+      CeedInt     field_size;
 
       // Note: need field size because vector may be longer than needed for output
       CeedCallBackend(CeedQFunctionFieldGetSize(output_fields[i], &field_size));
+      CeedCallBackend(CeedQFunctionFieldGetName(output_fields[i], &field_name));
       for (CeedSize j = 0; j < field_size * (CeedSize)Q; j++) {
-        CeedCheck(!isnan(impl->outputs[i][j]), ceed, CEED_ERROR_BACKEND,
-                  "QFunction output %" CeedInt_FMT " entry %" CeedSize_FMT " is NaN after restoring write-only access: %s:%s ", i, j, kernel_path,
-                  kernel_name);
+        CeedCheck(!isnan(impl->outputs[i][j]), CeedQFunctionReturnCeed(qf), CEED_ERROR_BACKEND,
+                  "QFunction output %" CeedInt_FMT " '%s' entry %" CeedSize_FMT " is NaN after restoring write-only access: %s:%s ", i, field_name, j,
+                  kernel_path, kernel_name);
       }
       CeedCallBackend(CeedVectorRestoreArray(V[i], &impl->outputs[i]));
-      VALGRIND_DISCARD(mem_block_ids[i]);
+      VALGRIND_DISCARD(output_block_ids[i]);
     }
   }
   CeedCallBackend(CeedQFunctionRestoreContextData(qf, &ctx_data));
@@ -107,6 +119,7 @@ int CeedQFunctionCreate_Memcheck(CeedQFunction qf) {
   CeedCallBackend(CeedQFunctionSetData(qf, impl));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Memcheck));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/memcheck/ceed-memcheck-qfunctioncontext.c b/backends/memcheck/ceed-memcheck-qfunctioncontext.c
index 4da0d0ee68..01f67802c3 100644
--- a/backends/memcheck/ceed-memcheck-qfunctioncontext.c
+++ b/backends/memcheck/ceed-memcheck-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -20,7 +20,7 @@ static int CeedQFunctionContextHasValidData_Memcheck(CeedQFunctionContext ctx, b
   CeedQFunctionContext_Memcheck *impl;
 
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
-  *has_valid_data = impl->data;
+  *has_valid_data = !!impl->data_allocated;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -30,9 +30,10 @@ static int CeedQFunctionContextHasValidData_Memcheck(CeedQFunctionContext ctx, b
 static int CeedQFunctionContextHasBorrowedDataOfType_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) {
   CeedQFunctionContext_Memcheck *impl;
 
-  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
   CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
-  *has_borrowed_data_of_type = impl->data_borrowed;
+
+  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
+  *has_borrowed_data_of_type = !!impl->data_borrowed;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -43,35 +44,69 @@ static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, CeedMe
   size_t                         ctx_size;
   CeedQFunctionContext_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
+
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
   CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
-
+  // Clear previous owned data buffers
+  if (impl->data_allocated) {
+    memset(impl->data_allocated, -42, ctx_size);
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
   CeedCallBackend(CeedFree(&impl->data_allocated));
+  if (impl->data_owned) {
+    memset(impl->data_owned, -42, ctx_size);
+    VALGRIND_DISCARD(impl->owned_block_id);
+  }
   CeedCallBackend(CeedFree(&impl->data_owned));
+
+  // Clear borrowed block id, if present
+  if (impl->data_borrowed) VALGRIND_DISCARD(impl->borrowed_block_id);
+
+  // Set internal pointers to external buffers
   switch (copy_mode) {
     case CEED_COPY_VALUES:
-      CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_owned));
+      impl->data_owned    = NULL;
       impl->data_borrowed = NULL;
-      impl->data          = impl->data_owned;
-      memcpy(impl->data, data, ctx_size);
       break;
     case CEED_OWN_POINTER:
-      impl->data_owned    = data;
-      impl->data_borrowed = NULL;
-      impl->data          = data;
+      impl->data_owned     = data;
+      impl->data_borrowed  = NULL;
+      impl->owned_block_id = VALGRIND_CREATE_BLOCK(impl->data_owned, ctx_size, "Owned external data buffer");
       break;
     case CEED_USE_POINTER:
-      impl->data_borrowed = data;
-      impl->data          = data;
+      impl->data_owned     = NULL;
+      impl->data_borrowed  = data;
+      impl->owned_block_id = VALGRIND_CREATE_BLOCK(impl->data_borrowed, ctx_size, "Borrowed external data buffer");
   }
-  // Copy data to check ctx_size bounds
+
+  // Create internal data buffer
   CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_allocated));
-  memcpy(impl->data_allocated, impl->data, ctx_size);
-  impl->data = impl->data_allocated;
-  VALGRIND_DISCARD(impl->mem_block_id);
-  impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->data, ctx_size, "'QFunction backend context data copy'");
+  impl->allocated_block_id = VALGRIND_CREATE_BLOCK(impl->data_allocated, ctx_size, "'Allocated internal context data buffer");
+  memcpy(impl->data_allocated, data, ctx_size);
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Sync data
+//------------------------------------------------------------------------------
+static int CeedQFunctionContextSyncData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type) {
+  size_t                         ctx_size;
+  CeedQFunctionContext_Memcheck *impl;
+
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
+
+  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
+  CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
+
+  // Copy internal buffer back to owned or borrowed data buffer
+  if (impl->data_owned) {
+    memcpy(impl->data_owned, impl->data_allocated, ctx_size);
+  }
+  if (impl->data_borrowed) {
+    memcpy(impl->data_borrowed, impl->data_allocated, ctx_size);
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -79,16 +114,27 @@ static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, CeedMe
 // QFunctionContext Take Data
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) {
+  size_t                         ctx_size;
   CeedQFunctionContext_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
+  CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+  // Synchronize memory
+  CeedCallBackend(CeedQFunctionContextSyncData_Memcheck(ctx, CEED_MEM_HOST));
 
+  // Return borrowed buffer
   *(void **)data      = impl->data_borrowed;
   impl->data_borrowed = NULL;
-  impl->data          = NULL;
-  VALGRIND_DISCARD(impl->mem_block_id);
+  VALGRIND_DISCARD(impl->borrowed_block_id);
+
+  // De-allocate internal memory
+  if (impl->data_allocated) {
+    memset(impl->data_allocated, -42, ctx_size);
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
   CeedCallBackend(CeedFree(&impl->data_allocated));
   return CEED_ERROR_SUCCESS;
 }
@@ -97,13 +143,19 @@ static int CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, CeedM
 // QFunctionContext Get Data
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextGetData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) {
+  size_t                         ctx_size;
   CeedQFunctionContext_Memcheck *impl;
 
-  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
+  CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
 
-  *(void **)data = impl->data;
+  // Create and return writable buffer
+  CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_writable_copy));
+  impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->data_writable_copy, ctx_size, "Allocated writeable data buffer copy");
+  memcpy(impl->data_writable_copy, impl->data_allocated, ctx_size);
+  *(void **)data = impl->data_writable_copy;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -114,13 +166,18 @@ static int CeedQFunctionContextGetDataRead_Memcheck(CeedQFunctionContext ctx, Ce
   size_t                         ctx_size;
   CeedQFunctionContext_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
+
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
   CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
-  CeedCallBackend(CeedQFunctionContextGetData_Memcheck(ctx, mem_type, data));
 
-  // Make copy to verify no write occurred
-  CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_read_only_copy));
-  memcpy(impl->data_read_only_copy, *(void **)data, ctx_size);
+  // Create and return read-only buffer
+  if (!impl->data_read_only_copy) {
+    CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_read_only_copy));
+    impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->data_read_only_copy, ctx_size, "Allocated read-only data buffer copy");
+    memcpy(impl->data_read_only_copy, impl->data_allocated, ctx_size);
+  }
+  *(void **)data = impl->data_read_only_copy;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -134,8 +191,14 @@ static int CeedQFunctionContextRestoreData_Memcheck(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
 
-  if (impl->data_borrowed) memcpy(impl->data_borrowed, impl->data, ctx_size);
-  if (impl->data_owned) memcpy(impl->data_owned, impl->data, ctx_size);
+  // Copy back to internal buffer and sync
+  memcpy(impl->data_allocated, impl->data_writable_copy, ctx_size);
+  CeedCallBackend(CeedQFunctionContextSyncData_Memcheck(ctx, CEED_MEM_HOST));
+
+  // Invalidate writable buffer
+  memset(impl->data_writable_copy, -42, ctx_size);
+  CeedCallBackend(CeedFree(&impl->data_writable_copy));
+  VALGRIND_DISCARD(impl->writable_block_id);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -149,10 +212,15 @@ static int CeedQFunctionContextRestoreDataRead_Memcheck(CeedQFunctionContext ctx
   CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size));
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
 
-  CeedCheck(!memcmp(impl->data, impl->data_read_only_copy, ctx_size), CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND,
-            "Context data changed while accessed in read-only mode");
+  // Verify no changes made during read-only access
+  bool is_changed = memcmp(impl->data_allocated, impl->data_read_only_copy, ctx_size);
+
+  CeedCheck(!is_changed, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "Context data changed while accessed in read-only mode");
 
+  // Invalidate read-only buffer
+  memset(impl->data_read_only_copy, -42, ctx_size);
   CeedCallBackend(CeedFree(&impl->data_read_only_copy));
+  VALGRIND_DISCARD(impl->read_only_block_id);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -165,15 +233,31 @@ static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) {
   CeedQFunctionContext_Memcheck      *impl;
 
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
-  CeedCallBackend(CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, &data_destroy_function));
 
+  CeedCallBackend(CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, &data_destroy_function));
   CeedCheck(data_destroy_mem_type == CEED_MEM_HOST, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND,
             "Can only destroy HOST memory for this backend");
 
+  // Run user destroy routine
   if (data_destroy_function) {
-    CeedCallBackend(data_destroy_function(impl->data_borrowed ? impl->data_borrowed : impl->data_owned));
+    bool is_borrowed = !!impl->data_borrowed;
+
+    CeedCallBackend(data_destroy_function(is_borrowed ? impl->data_borrowed : impl->data_owned));
+    if (is_borrowed) VALGRIND_DISCARD(impl->borrowed_block_id);
+    else VALGRIND_DISCARD(impl->owned_block_id);
+  }
+  // Free allocations and discard block ids
+  if (impl->data_allocated) {
+    CeedCallBackend(CeedFree(&impl->data_allocated));
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
+  if (impl->data_owned) {
+    CeedCallBackend(CeedFree(&impl->data_owned));
+    VALGRIND_DISCARD(impl->owned_block_id);
+  }
+  if (impl->data_borrowed) {
+    VALGRIND_DISCARD(impl->borrowed_block_id);
   }
-  CeedCallBackend(CeedFree(&impl->data_allocated));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -183,9 +267,19 @@ static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) {
 static int CeedQFunctionContextDestroy_Memcheck(CeedQFunctionContext ctx) {
   CeedQFunctionContext_Memcheck *impl;
 
+  // Free allocations and discard block ids
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
-  CeedCallBackend(CeedFree(&impl->data_allocated));
-  CeedCallBackend(CeedFree(&impl->data_owned));
+  if (impl->data_allocated) {
+    CeedCallBackend(CeedFree(&impl->data_allocated));
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
+  if (impl->data_owned) {
+    CeedCallBackend(CeedFree(&impl->data_owned));
+    VALGRIND_DISCARD(impl->owned_block_id);
+  }
+  if (impl->data_borrowed) {
+    VALGRIND_DISCARD(impl->borrowed_block_id);
+  }
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
@@ -208,6 +302,7 @@ int CeedQFunctionContextCreate_Memcheck(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", CeedQFunctionContextRestoreDataRead_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "DataDestroy", CeedQFunctionContextDataDestroy_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Memcheck));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/memcheck/ceed-memcheck-restriction.c b/backends/memcheck/ceed-memcheck-restriction.c
index f2877c3a69..e728d08d17 100644
--- a/backends/memcheck/ceed-memcheck-restriction.c
+++ b/backends/memcheck/ceed-memcheck-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -57,8 +57,8 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(CeedE
 
   // Apply restriction
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-      CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
+      for (CeedSize n = 0; n < elem_size; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
           vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
               uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * (CeedSize)strides[2]];
@@ -78,7 +78,7 @@ static inline int CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(CeedEl
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
         vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride];
       }
@@ -96,7 +96,7 @@ static inline int CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core(Ceed
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
         vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] =
             uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0);
@@ -115,7 +115,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedSize n = 0;
 
       CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
@@ -125,7 +125,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(
             uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                 impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];
       }
-      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
+      for (n = 1; n < elem_size - 1; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
           vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
               uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
@@ -156,7 +156,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memche
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedSize n = 0;
 
       CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
@@ -166,7 +166,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memche
             uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                 abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);
       }
-      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
+      for (n = 1; n < elem_size - 1; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
           vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
               uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
@@ -203,8 +203,8 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(CeedEle
 
   // Apply restriction
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-      CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
+      for (CeedSize n = 0; n < elem_size; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) {
           vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] +=
               uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset];
@@ -381,7 +381,7 @@ static inline int CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(CeedEl
       }
     } else {
       for (CeedSize i = 0; i < num_points; i++) {
-        for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] = uu[j * num_points + i + e_vec_offset];
+        for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] += uu[j * num_points + i + e_vec_offset];
       }
     }
     e_vec_offset += num_points * (CeedSize)num_comp;
@@ -420,8 +420,8 @@ static inline int CeedElemRestrictionApply_Memcheck_Core(CeedElemRestriction rst
     // Sum into for transpose mode
     switch (rstr_type) {
       case CEED_RESTRICTION_STRIDED:
-        CeedCallBackend(
-            CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
+        CeedCallBackend(CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset,
+                                                                               uu, vv));
         break;
       case CEED_RESTRICTION_STANDARD:
         CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
@@ -460,8 +460,8 @@ static inline int CeedElemRestrictionApply_Memcheck_Core(CeedElemRestriction rst
     // Overwrite for notranspose mode
     switch (rstr_type) {
       case CEED_RESTRICTION_STRIDED:
-        CeedCallBackend(
-            CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
+        CeedCallBackend(CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size,
+                                                                                 v_offset, uu, vv));
         break;
       case CEED_RESTRICTION_STANDARD:
         CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
@@ -673,6 +673,21 @@ int CeedElemRestrictionCreate_Memcheck(CeedMemType mem_type, CeedCopyMode copy_m
     }
   }
 
+  // Expand E-vector size for AtPoints
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedSize max_points = 0, num_points_total = 0;
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points = offsets[i + 1] - offsets[i];
+
+      max_points = CeedIntMax(max_points, num_points);
+      num_points_total += num_points;
+    }
+    // -- Increase size for last element
+    num_points_total += (max_points - (offsets[num_elem] - offsets[num_elem - 1]));
+    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, num_points_total * num_comp));
+  }
+
   // Offsets data
   if (rstr_type != CEED_RESTRICTION_STRIDED) {
     // Check indices
@@ -745,14 +760,15 @@ int CeedElemRestrictionCreate_Memcheck(CeedMemType mem_type, CeedCopyMode copy_m
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Memcheck));
   if (rstr_type == CEED_RESTRICTION_POINTS) {
-    CeedCallBackend(
-        CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", CeedElemRestrictionApplyAtPointsInElement_Memcheck));
+    CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement",
+                                           CeedElemRestrictionApplyAtPointsInElement_Memcheck));
   }
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyBlock", CeedElemRestrictionApplyBlock_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Memcheck));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/memcheck/ceed-memcheck-serial.c b/backends/memcheck/ceed-memcheck-serial.c
index f23a8013e6..a0140fbd75 100644
--- a/backends/memcheck/ceed-memcheck-serial.c
+++ b/backends/memcheck/ceed-memcheck-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -23,6 +23,7 @@ static int CeedInit_Memcheck(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Memcheck));
diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c
index b12b7ead95..c5dd1fe56d 100644
--- a/backends/memcheck/ceed-memcheck-vector.c
+++ b/backends/memcheck/ceed-memcheck-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,6 +7,7 @@
 
 #include <ceed.h>
 #include <ceed/backend.h>
+#include <assert.h>
 #include <math.h>
 #include <stdbool.h>
 #include <string.h>
@@ -21,7 +22,7 @@ static int CeedVectorHasValidArray_Memcheck(CeedVector vec, bool *has_valid_arra
   CeedVector_Memcheck *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
-  *has_valid_array = impl->array;
+  *has_valid_array = !!impl->array_allocated;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -31,9 +32,10 @@ static int CeedVectorHasValidArray_Memcheck(CeedVector vec, bool *has_valid_arra
 static inline int CeedVectorHasBorrowedArrayOfType_Memcheck(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) {
   CeedVector_Memcheck *impl;
 
-  CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
-  *has_borrowed_array_of_type = impl->array_borrowed;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  *has_borrowed_array_of_type = !!impl->array_borrowed;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -44,39 +46,108 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
+
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only set HOST memory for this backend");
-
+  // Clear previous owned arrays
+  if (impl->array_allocated) {
+    for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = NAN;
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
   CeedCallBackend(CeedFree(&impl->array_allocated));
-  CeedCallBackend(CeedFree(&impl->array_owned));
+  if (copy_mode != CEED_COPY_VALUES) {
+    if (impl->array_owned) {
+      for (CeedSize i = 0; i < length; i++) impl->array_owned[i] = NAN;
+      VALGRIND_DISCARD(impl->owned_block_id);
+    }
+    CeedCallBackend(CeedFree(&impl->array_owned));
+  }
+
+  // Clear borrowed block id, if present
+  if (impl->array_borrowed) VALGRIND_DISCARD(impl->borrowed_block_id);
+
+  // Set internal pointers to external arrays
   switch (copy_mode) {
     case CEED_COPY_VALUES:
-      CeedCallBackend(CeedCalloc(length, &impl->array_owned));
-      impl->array_borrowed = NULL;
-      impl->array          = impl->array_owned;
-      if (array) {
-        memcpy(impl->array, array, length * sizeof(array[0]));
-      } else {
-        for (CeedInt i = 0; i < length; i++) impl->array[i] = NAN;
-      }
+      // Nothing to update
       break;
     case CEED_OWN_POINTER:
       impl->array_owned    = array;
       impl->array_borrowed = NULL;
-      impl->array          = array;
+      impl->owned_block_id = VALGRIND_CREATE_BLOCK(impl->array_owned, length * sizeof(CeedScalar), "Owned external array buffer");
       break;
     case CEED_USE_POINTER:
-      impl->array_borrowed = array;
-      impl->array          = array;
+      impl->array_owned       = NULL;
+      impl->array_borrowed    = array;
+      impl->borrowed_block_id = VALGRIND_CREATE_BLOCK(impl->array_borrowed, length * sizeof(CeedScalar), "Borrowed external array buffer");
+      break;
   }
-  // Copy data to check access
+
+  // Create internal array data buffer
   CeedCallBackend(CeedCalloc(length, &impl->array_allocated));
-  memcpy(impl->array_allocated, impl->array, length * sizeof(array[0]));
-  impl->array = impl->array_allocated;
-  VALGRIND_DISCARD(impl->mem_block_id);
-  impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->array, length * sizeof(array[0]), "'Vector backend array data copy'");
+  impl->allocated_block_id = VALGRIND_CREATE_BLOCK(impl->array_allocated, length * sizeof(CeedScalar), "Allocated internal array buffer");
+  if (array) {
+    memcpy(impl->array_allocated, array, length * sizeof(CeedScalar));
+  } else {
+    for (CeedInt i = 0; i < length; i++) impl->array_allocated[i] = NAN;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Set internal array to value
+//------------------------------------------------------------------------------
+static int CeedVectorSetValue_Memcheck(CeedVector vec, CeedScalar value) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+
+  if (!impl->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, CEED_MEM_HOST, CEED_COPY_VALUES, NULL));
+  assert(impl->array_allocated);
+  for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = value;
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Set internal array to value strided
+//------------------------------------------------------------------------------
+static int CeedVectorSetValueStrided_Memcheck(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+
+  if (!impl->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, CEED_MEM_HOST, CEED_COPY_VALUES, NULL));
+  assert(impl->array_allocated);
+  if (stop == -1) stop = length;
+  for (CeedSize i = start; i < stop; i += step) impl->array_allocated[i] = val;
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Sync arrays
+//------------------------------------------------------------------------------
+static int CeedVectorSyncArray_Memcheck(const CeedVector vec, CeedMemType mem_type) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl;
+
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+
+  // Copy internal buffer back to owned or borrowed array
+  if (impl->array_owned) {
+    memcpy(impl->array_owned, impl->array_allocated, length * sizeof(CeedScalar));
+  }
+  if (impl->array_borrowed) {
+    memcpy(impl->array_borrowed, impl->array_allocated, length * sizeof(CeedScalar));
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -84,16 +155,27 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee
 // Vector Take Array
 //------------------------------------------------------------------------------
 static int CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
+  CeedSize             length;
   CeedVector_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+
   CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+  // Synchronize memory
+  CeedCallBackend(CeedVectorSyncArray_Memcheck(vec, CEED_MEM_HOST));
 
+  // Return borrowed array
   (*array)             = impl->array_borrowed;
   impl->array_borrowed = NULL;
-  impl->array          = NULL;
-  VALGRIND_DISCARD(impl->mem_block_id);
+  VALGRIND_DISCARD(impl->borrowed_block_id);
+
+  // De-allocate internal memory
+  if (impl->array_allocated) {
+    for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] = NAN;
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
   CeedCallBackend(CeedFree(&impl->array_allocated));
   return CEED_ERROR_SUCCESS;
 }
@@ -102,13 +184,19 @@ static int CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, Ce
 // Vector Get Array
 //------------------------------------------------------------------------------
 static int CeedVectorGetArray_Memcheck(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
+  CeedSize             length;
   CeedVector_Memcheck *impl;
 
-  CeedCallBackend(CeedVectorGetData(vec, &impl));
-
   CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
 
-  *array = impl->array;
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+
+  // Create and return writable buffer
+  CeedCallBackend(CeedCalloc(length, &impl->array_writable_copy));
+  impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->array_writable_copy, length * sizeof(CeedScalar), "Allocated writeable array buffer copy");
+  memcpy(impl->array_writable_copy, impl->array_allocated, length * sizeof(CeedScalar));
+  *array = impl->array_writable_copy;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -119,16 +207,18 @@ static int CeedVectorGetArrayRead_Memcheck(CeedVector vec, CeedMemType mem_type,
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  CeedCallBackend(CeedVectorGetArray_Memcheck(vec, mem_type, (CeedScalar **)array));
-
-  // Make copy to verify no write occurred
+  // Create and return read-only buffer
   if (!impl->array_read_only_copy) {
     CeedCallBackend(CeedCalloc(length, &impl->array_read_only_copy));
-    memcpy(impl->array_read_only_copy, *array, length * sizeof((*array)[0]));
+    impl->writable_block_id = VALGRIND_CREATE_BLOCK(impl->array_read_only_copy, length * sizeof(CeedScalar), "Allocated read-only array buffer copy");
+    memcpy(impl->array_read_only_copy, impl->array_allocated, length * sizeof(CeedScalar));
   }
+  *array = impl->array_read_only_copy;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -139,12 +229,18 @@ static int CeedVectorGetArrayWrite_Memcheck(CeedVector vec, CeedMemType mem_type
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
+  CeedCheck(mem_type == CEED_MEM_HOST, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend");
+
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  // Invalidate data to make sure no read occurs
-  if (!impl->array) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, mem_type, CEED_COPY_VALUES, NULL));
+  // Allocate buffer if necessary
+  if (!impl->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(vec, mem_type, CEED_COPY_VALUES, NULL));
+
+  // Get writable buffer
   CeedCallBackend(CeedVectorGetArray_Memcheck(vec, mem_type, array));
+
+  // Invalidate array data to prevent accidental reads
   for (CeedSize i = 0; i < length; i++) (*array)[i] = NAN;
   impl->is_write_only_access = true;
   return CEED_ERROR_SUCCESS;
@@ -154,27 +250,31 @@ static int CeedVectorGetArrayWrite_Memcheck(CeedVector vec, CeedMemType mem_type
 // Vector Restore Array
 //------------------------------------------------------------------------------
 static int CeedVectorRestoreArray_Memcheck(CeedVector vec) {
-  Ceed                 ceed;
   CeedSize             length;
   CeedVector_Memcheck *impl;
 
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
 
+  // Check for unset entries after write-only access
   if (impl->is_write_only_access) {
     for (CeedSize i = 0; i < length; i++) {
-      if (isnan(impl->array[i]))
-        CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "WARNING: Vec entry %" CeedSize_FMT " is NaN after restoring write-only access", i);
+      if (isnan(impl->array_writable_copy[i])) {
+        CeedDebug256(CeedVectorReturnCeed(vec), CEED_DEBUG_COLOR_WARNING,
+                     "WARNING: Vec entry %" CeedSize_FMT " is NaN after restoring write-only access", i);
+      }
     }
     impl->is_write_only_access = false;
   }
-  if (impl->array_borrowed) {
-    memcpy(impl->array_borrowed, impl->array, length * sizeof(impl->array[0]));
-  }
-  if (impl->array_owned) {
-    memcpy(impl->array_owned, impl->array, length * sizeof(impl->array[0]));
-  }
+
+  // Copy back to internal buffer and sync
+  memcpy(impl->array_allocated, impl->array_writable_copy, length * sizeof(CeedScalar));
+  CeedCallBackend(CeedVectorSyncArray_Memcheck(vec, CEED_MEM_HOST));
+
+  // Invalidate writable buffer
+  for (CeedSize i = 0; i < length; i++) impl->array_writable_copy[i] = NAN;
+  CeedCallBackend(CeedFree(&impl->array_writable_copy));
+  VALGRIND_DISCARD(impl->writable_block_id);
   return CEED_ERROR_SUCCESS;
 }
 
@@ -188,10 +288,93 @@ static int CeedVectorRestoreArrayRead_Memcheck(CeedVector vec) {
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
 
-  CeedCheck(!memcmp(impl->array, impl->array_read_only_copy, length * sizeof(impl->array[0])), CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
-            "Array data changed while accessed in read-only mode");
+  // Verify no changes made during read-only access
+  bool is_changed = memcmp(impl->array_allocated, impl->array_read_only_copy, length * sizeof(CeedScalar));
 
+  CeedCheck(!is_changed, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "Array data changed while accessed in read-only mode");
+
+  // Invalidate read-only buffer
+  for (CeedSize i = 0; i < length; i++) impl->array_read_only_copy[i] = NAN;
   CeedCallBackend(CeedFree(&impl->array_read_only_copy));
+  VALGRIND_DISCARD(impl->read_only_block_id);
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Take reciprocal of a vector
+//------------------------------------------------------------------------------
+static int CeedVectorReciprocal_Memcheck(CeedVector vec) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl;
+
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedVectorGetLength(vec, &length));
+
+  for (CeedSize i = 0; i < length; i++) {
+    if (fabs(impl->array_allocated[i]) > CEED_EPSILON) impl->array_allocated[i] = 1. / impl->array_allocated[i];
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Compute x = alpha x
+//------------------------------------------------------------------------------
+static int CeedVectorScale_Memcheck(CeedVector x, CeedScalar alpha) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl;
+
+  CeedCallBackend(CeedVectorGetData(x, &impl));
+  CeedCallBackend(CeedVectorGetLength(x, &length));
+
+  for (CeedSize i = 0; i < length; i++) impl->array_allocated[i] *= alpha;
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Compute y = alpha x + y
+//------------------------------------------------------------------------------
+static int CeedVectorAXPY_Memcheck(CeedVector y, CeedScalar alpha, CeedVector x) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl_x, *impl_y;
+
+  CeedCallBackend(CeedVectorGetData(x, &impl_x));
+  CeedCallBackend(CeedVectorGetData(y, &impl_y));
+  CeedCallBackend(CeedVectorGetLength(y, &length));
+
+  for (CeedSize i = 0; i < length; i++) impl_y->array_allocated[i] += alpha * impl_x->array_allocated[i];
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Compute y = alpha x + beta y
+//------------------------------------------------------------------------------
+static int CeedVectorAXPBY_Memcheck(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl_x, *impl_y;
+
+  CeedCallBackend(CeedVectorGetData(x, &impl_x));
+  CeedCallBackend(CeedVectorGetData(y, &impl_y));
+  CeedCallBackend(CeedVectorGetLength(y, &length));
+
+  for (CeedSize i = 0; i < length; i++) impl_y->array_allocated[i] = alpha * impl_x->array_allocated[i] + beta * impl_y->array_allocated[i];
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Compute the pointwise multiplication w = x .* y
+//------------------------------------------------------------------------------
+static int CeedVectorPointwiseMult_Memcheck(CeedVector w, CeedVector x, CeedVector y) {
+  CeedSize             length;
+  CeedVector_Memcheck *impl_x, *impl_y, *impl_w;
+
+  CeedCallBackend(CeedVectorGetData(x, &impl_x));
+  CeedCallBackend(CeedVectorGetData(y, &impl_y));
+  CeedCallBackend(CeedVectorGetData(w, &impl_w));
+  CeedCallBackend(CeedVectorGetLength(w, &length));
+
+  if (!impl_w->array_allocated) CeedCallBackend(CeedVectorSetArray_Memcheck(w, CEED_MEM_HOST, CEED_COPY_VALUES, NULL));
+  assert(impl_w->array_allocated);
+  for (CeedSize i = 0; i < length; i++) impl_w->array_allocated[i] = impl_x->array_allocated[i] * impl_y->array_allocated[i];
   return CEED_ERROR_SUCCESS;
 }
 
@@ -201,10 +384,19 @@ static int CeedVectorRestoreArrayRead_Memcheck(CeedVector vec) {
 static int CeedVectorDestroy_Memcheck(CeedVector vec) {
   CeedVector_Memcheck *impl;
 
+  // Free allocations and discard block ids
   CeedCallBackend(CeedVectorGetData(vec, &impl));
-  VALGRIND_DISCARD(impl->mem_block_id);
-  CeedCallBackend(CeedFree(&impl->array_allocated));
-  CeedCallBackend(CeedFree(&impl->array_owned));
+  if (impl->array_allocated) {
+    CeedCallBackend(CeedFree(&impl->array_allocated));
+    VALGRIND_DISCARD(impl->allocated_block_id);
+  }
+  if (impl->array_owned) {
+    CeedCallBackend(CeedFree(&impl->array_owned));
+    VALGRIND_DISCARD(impl->owned_block_id);
+  }
+  if (impl->array_borrowed) {
+    VALGRIND_DISCARD(impl->borrowed_block_id);
+  }
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
@@ -216,20 +408,28 @@ int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec) {
   Ceed                 ceed;
   CeedVector_Memcheck *impl;
 
-  CeedCallBackend(CeedCalloc(1, &impl));
-  CeedCallBackend(CeedVectorSetData(vec, impl));
-
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", CeedVectorSetValue_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", CeedVectorSetValueStrided_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArray", CeedVectorRestoreArray_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArrayRead", CeedVectorRestoreArrayRead_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", CeedVectorScale_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", CeedVectorAXPY_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", CeedVectorAXPBY_Memcheck));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Memcheck));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Memcheck));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedCalloc(1, &impl));
+  CeedCallBackend(CeedVectorSetData(vec, impl));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/memcheck/ceed-memcheck.h b/backends/memcheck/ceed-memcheck.h
index 603597fb1e..49f14e0270 100644
--- a/backends/memcheck/ceed-memcheck.h
+++ b/backends/memcheck/ceed-memcheck.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -10,13 +10,22 @@
 #include <ceed/backend.h>
 
 typedef struct {
-  int         mem_block_id;
-  bool        is_write_only_access;
-  CeedScalar *array;
+  // Internal array buffer
+  int         allocated_block_id;
   CeedScalar *array_allocated;
+  // Owned external array
+  int         owned_block_id;
   CeedScalar *array_owned;
+  // Borrowed external array
+  int         borrowed_block_id;
   CeedScalar *array_borrowed;
+  // Externally viewable read-only array
+  int         read_only_block_id;
   CeedScalar *array_read_only_copy;
+  // Externally viewable writable array
+  bool        is_write_only_access;
+  int         writable_block_id;
+  CeedScalar *array_writable_copy;
 } CeedVector_Memcheck;
 
 typedef struct {
@@ -31,18 +40,27 @@ typedef struct {
 } CeedElemRestriction_Memcheck;
 
 typedef struct {
+  bool               setup_done;
   const CeedScalar **inputs;
   CeedScalar       **outputs;
-  bool               setup_done;
 } CeedQFunction_Memcheck;
 
 typedef struct {
-  int   mem_block_id;
-  void *data;
+  // Internal data buffer
+  int   allocated_block_id;
   void *data_allocated;
+  // Owned external data
+  int   owned_block_id;
   void *data_owned;
+  // Borrowed external data
+  int   borrowed_block_id;
   void *data_borrowed;
+  // Externally viewable read-only data
+  int   read_only_block_id;
   void *data_read_only_copy;
+  // Externally viewable writable data
+  int   writable_block_id;
+  void *data_writable_copy;
 } CeedQFunctionContext_Memcheck;
 
 CEED_INTERN int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec);
diff --git a/backends/occa/ceed-occa-basis.cpp b/backends/occa/ceed-occa-basis.cpp
deleted file mode 100644
index 0c33da5453..0000000000
--- a/backends/occa/ceed-occa-basis.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-basis.hpp"
-
-#include "ceed-occa-tensor-basis.hpp"
-
-namespace ceed {
-namespace occa {
-Basis::Basis() : ceedComponentCount(0), dim(0), P(0), Q(0) {}
-
-Basis::~Basis() {}
-
-Basis *Basis::getBasis(CeedBasis basis, const bool assertValid) {
-  if (!basis) {
-    return NULL;
-  }
-
-  int    ierr;
-  Basis *basis_ = NULL;
-
-  ierr = CeedBasisGetData(basis, &basis_);
-  if (assertValid) {
-    CeedOccaFromChk(ierr);
-  }
-
-  return basis_;
-}
-
-Basis *Basis::from(CeedBasis basis) {
-  Basis *basis_ = getBasis(basis);
-  if (!basis_) {
-    return NULL;
-  }
-
-  CeedCallOcca(basis_->setCeedFields(basis));
-
-  return basis_;
-}
-
-Basis *Basis::from(CeedOperatorField operatorField) {
-  CeedBasis basis;
-  CeedCallOcca(CeedOperatorFieldGetBasis(operatorField, &basis));
-  return from(basis);
-}
-
-int Basis::setCeedFields(CeedBasis basis) {
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
-  CeedCallBackend(CeedBasisGetNumComponents(basis, &ceedComponentCount));
-
-  return CEED_ERROR_SUCCESS;
-}
-
-//---[ Ceed Callbacks ]-----------
-int Basis::registerCeedFunction(Ceed ceed, CeedBasis basis, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "Basis", basis, fname, f);
-}
-
-int Basis::ceedApply(CeedBasis basis, const CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector u, CeedVector v) {
-  Basis  *basis_ = Basis::from(basis);
-  Vector *U      = Vector::from(u);
-  Vector *V      = Vector::from(v);
-
-  if (!basis_) {
-    return staticCeedError("Incorrect CeedBasis argument: op");
-  }
-
-  return basis_->apply(nelem, tmode, emode, U, V);
-}
-
-int Basis::ceedDestroy(CeedBasis basis) {
-  delete getBasis(basis, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-basis.hpp b/backends/occa/ceed-occa-basis.hpp
deleted file mode 100644
index 2fe01ec052..0000000000
--- a/backends/occa/ceed-occa-basis.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_BASIS_HEADER
-#define CEED_OCCA_BASIS_HEADER
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-class Basis : public CeedObject {
- public:
-  // Ceed object information
-  CeedInt ceedComponentCount;
-
-  // Owned information
-  CeedInt dim;
-  CeedInt P;
-  CeedInt Q;
-
-  Basis();
-
-  virtual ~Basis();
-
-  static Basis *getBasis(CeedBasis basis, const bool assertValid = true);
-
-  static Basis *from(CeedBasis basis);
-  static Basis *from(CeedOperatorField operatorField);
-
-  int setCeedFields(CeedBasis basis);
-
-  virtual bool isTensorBasis() const = 0;
-
-  virtual const char *getFunctionSource() const = 0;
-
-  virtual int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *u, Vector *v) = 0;
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedBasis basis, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedApply(CeedBasis basis, const CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector u, CeedVector v);
-
-  static int ceedDestroy(CeedBasis basis);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-ceed-object.cpp b/backends/occa/ceed-occa-ceed-object.cpp
deleted file mode 100644
index c6dd28fef2..0000000000
--- a/backends/occa/ceed-occa-ceed-object.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-ceed-object.hpp"
-
-#include "ceed-occa-context.hpp"
-
-namespace ceed {
-namespace occa {
-CeedObject::CeedObject(Ceed ceed_) : ceed(ceed_) {}
-
-::occa::device CeedObject::getDevice() {
-  if (!_device.isInitialized()) {
-    _device = Context::from(ceed)->device;
-  }
-  return _device;
-}
-
-bool CeedObject::usingCpuDevice() const { return Context::from(ceed)->usingCpuDevice(); }
-
-bool CeedObject::usingGpuDevice() const { return Context::from(ceed)->usingGpuDevice(); }
-
-int CeedObject::ceedError(const std::string &message) const { return CeedError(ceed, CEED_ERROR_BACKEND, message.c_str()); }
-
-int CeedObject::staticCeedError(const std::string &message) { return CeedError(NULL, CEED_ERROR_BACKEND, message.c_str()); }
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-ceed-object.hpp b/backends/occa/ceed-occa-ceed-object.hpp
deleted file mode 100644
index 46235cbad5..0000000000
--- a/backends/occa/ceed-occa-ceed-object.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_CEEDOBJECT_HEADER
-#define CEED_OCCA_CEEDOBJECT_HEADER
-
-#include "ceed-occa-context.hpp"
-
-namespace ceed {
-namespace occa {
-class CeedObject {
- private:
-  ::occa::device _device;
-
- public:
-  Ceed ceed;
-
-  CeedObject(Ceed ceed_ = NULL);
-
-  ::occa::device getDevice();
-
-  bool usingCpuDevice() const;
-  bool usingGpuDevice() const;
-
-  int        ceedError(const std::string &message) const;
-  static int staticCeedError(const std::string &message);
-};
-
-namespace SyncState {
-static const int none   = 0;
-static const int host   = (1 << 0);
-static const int device = (1 << 1);
-static const int all    = host | device;
-}  // namespace SyncState
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-context.cpp b/backends/occa/ceed-occa-context.cpp
deleted file mode 100644
index 4a705147de..0000000000
--- a/backends/occa/ceed-occa-context.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-context.hpp"
-
-namespace ceed {
-namespace occa {
-Context::Context(::occa::device device_) : device(device_) {
-  const std::string mode = device.mode();
-  _usingCpuDevice        = (mode == "Serial" || mode == "OpenMP");
-  _usingGpuDevice        = (mode == "CUDA" || mode == "HIP" || mode == "OpenCL");
-}
-
-Context *Context::from(Ceed ceed) {
-  if (!ceed) {
-    return NULL;
-  }
-
-  Context *context;
-  CeedGetData(ceed, (void **)&context);
-  return context;
-}
-
-bool Context::usingCpuDevice() const { return _usingCpuDevice; }
-
-bool Context::usingGpuDevice() const { return _usingGpuDevice; }
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-context.hpp b/backends/occa/ceed-occa-context.hpp
deleted file mode 100644
index 3e1586082b..0000000000
--- a/backends/occa/ceed-occa-context.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_CONTEXT_HEADER
-#define CEED_OCCA_CONTEXT_HEADER
-
-#include "ceed-occa-types.hpp"
-
-namespace ceed {
-namespace occa {
-class Context {
- private:
-  bool _usingCpuDevice;
-  bool _usingGpuDevice;
-
- public:
-  ::occa::device device;
-
-  Context(::occa::device device_);
-
-  static Context *from(Ceed ceed);
-
-  bool usingCpuDevice() const;
-  bool usingGpuDevice() const;
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-cpu-operator.cpp b/backends/occa/ceed-occa-cpu-operator.cpp
deleted file mode 100644
index cf5bd3fe59..0000000000
--- a/backends/occa/ceed-occa-cpu-operator.cpp
+++ /dev/null
@@ -1,751 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-cpu-operator.hpp"
-
-#include "ceed-occa-elem-restriction.hpp"
-#include "ceed-occa-qfunction.hpp"
-#include "ceed-occa-qfunctioncontext.hpp"
-#include "ceed-occa-simplex-basis.hpp"
-#include "ceed-occa-tensor-basis.hpp"
-
-#define CEED_OCCA_PRINT_KERNEL_HASHES 0
-
-namespace ceed {
-namespace occa {
-CpuOperator::CpuOperator() {}
-
-CpuOperator::~CpuOperator() {}
-
-void CpuOperator::setupVectors() {
-  setupVectors(args.inputCount(), args.opInputs, args.qfInputs, dofInputs);
-  setupVectors(args.outputCount(), args.opOutputs, args.qfOutputs, dofOutputs);
-}
-
-void CpuOperator::setupVectors(const int fieldCount, OperatorFieldVector &opFields, QFunctionFieldVector &qfFields, VectorVector &vectors) {
-  for (int i = 0; i < fieldCount; ++i) {
-    const QFunctionField &qfField = qfFields[i];
-    const OperatorField  &opField = opFields[i];
-
-    if (qfField.evalMode == CEED_EVAL_WEIGHT) {
-      // Weight kernel doesn't use the input
-      vectors.push_back(NULL);
-      continue;
-    }
-
-    int entries;
-    if (qfField.evalMode == CEED_EVAL_NONE) {
-      // The output vector stores values at quadrature points
-      entries = (ceedElementCount * ceedQ * qfField.size);
-    } else {
-      // The output vector stores the element dof values
-      entries = (ceedElementCount * opField.getElementSize() * opField.getComponentCount());
-    }
-
-    Vector *dofVector = new Vector();
-    dofVector->ceed   = ceed;
-    dofVector->resize(entries);
-
-    vectors.push_back(dofVector);
-  }
-}
-
-void CpuOperator::freeVectors() {
-  for (int i = 0; i < args.inputCount(); ++i) {
-    delete dofInputs[i];
-  }
-  for (int i = 0; i < args.outputCount(); ++i) {
-    delete dofOutputs[i];
-  }
-  dofInputs.clear();
-  dofOutputs.clear();
-}
-
-void CpuOperator::setupInputs(Vector *in) {
-  for (int i = 0; i < args.inputCount(); ++i) {
-    // Weight kernel doesn't use the input vector
-    if (args.getInputEvalMode(i) == CEED_EVAL_WEIGHT) {
-      continue;
-    }
-
-    const OperatorField &opField = args.getOpInput(i);
-
-    Vector *input  = opField.usesActiveVector() ? in : opField.vec;
-    Vector *output = dofInputs[i];
-
-    opField.elemRestriction->apply(CEED_NOTRANSPOSE, *input, *output);
-  }
-}
-
-void CpuOperator::setupOutputs(Vector *out) {
-  for (int i = 0; i < args.outputCount(); ++i) {
-    // Weight is not supported for output vectors
-    if (args.getOutputEvalMode(i) == CEED_EVAL_WEIGHT) {
-      continue;
-    }
-
-    const OperatorField &opField = args.getOpOutput(i);
-
-    Vector *input  = dofOutputs[i];
-    Vector *output = opField.usesActiveVector() ? out : opField.vec;
-
-    opField.elemRestriction->apply(CEED_TRANSPOSE, *input, *output);
-  }
-}
-
-void CpuOperator::applyQFunction() {
-  if (qfunction->qFunctionContext) {
-    QFunctionContext *ctx = QFunctionContext::from(qfunction->qFunctionContext);
-    applyAddKernel.pushArg(ctx->getKernelArg());
-  } else {
-    applyAddKernel.pushArg(::occa::null);
-  }
-  applyAddKernel.pushArg(ceedElementCount);
-
-  for (int i = 0; i < args.inputCount(); ++i) {
-    const bool isInput = true;
-    pushKernelArgs(dofInputs[i], isInput, i);
-  }
-
-  for (int i = 0; i < args.outputCount(); ++i) {
-    const bool isInput = false;
-    pushKernelArgs(dofOutputs[i], isInput, i);
-  }
-
-  applyAddKernel.run();
-}
-
-void CpuOperator::pushKernelArgs(Vector *vec, const bool isInput, const int index) {
-  const OperatorField  &opField = args.getOpField(isInput, index);
-  const QFunctionField &qfField = args.getQfField(isInput, index);
-
-  if (opField.hasBasis()) {
-    if (opField.usingTensorBasis()) {
-      pushTensorBasisKernelArgs(qfField, *((TensorBasis *)opField.basis));
-    } else {
-      pushSimplexBasisKernelArgs(qfField, *((SimplexBasis *)opField.basis));
-    }
-  }
-
-  if (vec) {
-    if (isInput) {
-      applyAddKernel.pushArg(vec->getConstKernelArg());
-    } else {
-      applyAddKernel.pushArg(vec->getKernelArg());
-    }
-  } else {
-    applyAddKernel.pushArg(::occa::null);
-  }
-}
-
-void CpuOperator::pushTensorBasisKernelArgs(const QFunctionField &qfField, TensorBasis &basis) {
-  switch (qfField.evalMode) {
-    case CEED_EVAL_INTERP: {
-      applyAddKernel.pushArg(basis.interp1D);
-      break;
-    }
-    case CEED_EVAL_GRAD: {
-      applyAddKernel.pushArg(basis.interp1D);
-      applyAddKernel.pushArg(basis.grad1D);
-      break;
-    }
-    case CEED_EVAL_WEIGHT: {
-      applyAddKernel.pushArg(basis.qWeight1D);
-      break;
-    }
-    default: {
-    }
-  }
-}
-
-void CpuOperator::pushSimplexBasisKernelArgs(const QFunctionField &qfField, SimplexBasis &basis) {
-  switch (qfField.evalMode) {
-    case CEED_EVAL_INTERP: {
-      applyAddKernel.pushArg(basis.interp);
-      break;
-    }
-    case CEED_EVAL_GRAD: {
-      applyAddKernel.pushArg(basis.grad);
-      break;
-    }
-    case CEED_EVAL_WEIGHT: {
-      applyAddKernel.pushArg(basis.qWeight);
-      break;
-    }
-    default: {
-    }
-  }
-}
-
-::occa::properties CpuOperator::getKernelProps() {
-  ::occa::properties props = qfunction->getKernelProps(ceedQ);
-
-  props["defines/OCCA_Q"] = ceedQ;
-
-  return props;
-}
-
-void CpuOperator::applyAdd(Vector *in, Vector *out) {
-  // Setup helper vectors
-  setupVectors();
-
-  // Dof nodes -> local dofs
-  setupInputs(in);
-
-  // Apply qFunction
-  applyQFunction();
-
-  // Local dofs -> dof nodes
-  setupOutputs(out);
-
-  // Cleanup helper vectors
-  freeVectors();
-}
-
-::occa::kernel CpuOperator::buildApplyAddKernel() {
-  std::stringstream ss;
-
-  addBasisFunctionSource(ss);
-
-  addKernelSource(ss);
-
-  const std::string kernelSource = ss.str();
-
-  CeedDebug(ceed, kernelSource.c_str());
-
-  // TODO: Store a kernel per Q
-  return getDevice().buildKernelFromString(kernelSource, "applyAdd", getKernelProps());
-}
-
-//---[ Kernel Generation ]--------------------
-void CpuOperator::addBasisFunctionSource(std::stringstream &ss) {
-  BasisVector sourceBasis;
-  for (int i = 0; i < args.inputCount(); ++i) {
-    addBasisIfMissingSource(sourceBasis, args.getOpInput(i).basis);
-  }
-  for (int i = 0; i < args.outputCount(); ++i) {
-    addBasisIfMissingSource(sourceBasis, args.getOpOutput(i).basis);
-  }
-
-  // Make sure there's a break between past code
-  ss << std::endl;
-
-  // Add source code for each unique basis function
-  const int basisCount = (int)sourceBasis.size();
-  for (int i = 0; i < basisCount; ++i) {
-    Basis &basis = *(sourceBasis[i]);
-
-    ss << "// Code generation for basis " << i + 1 << std::endl << "//---[ START ]-------------------------------" << std::endl;
-
-    // Undefine and redefine required variables
-    if (basis.isTensorBasis()) {
-      TensorBasis &basisTensor = (TensorBasis &)basis;
-      ss << "#undef  TENSOR_FUNCTION" << std::endl
-         << "#undef  P1D" << std::endl
-         << "#undef  Q1D" << std::endl
-         << "#define P1D " << basisTensor.P1D << std::endl
-         << "#define Q1D " << basisTensor.Q1D << std::endl;
-    } else {
-      SimplexBasis &basisSimplex = (SimplexBasis &)basis;
-      ss << "#undef  SIMPLEX_FUNCTION" << std::endl
-         << "#undef  DIM" << std::endl
-         << "#undef  P" << std::endl
-         << "#undef  Q" << std::endl
-         << "#define DIM " << basisSimplex.dim << std::endl
-         << "#define P   " << basisSimplex.P << std::endl
-         << "#define Q   " << basisSimplex.Q << std::endl;
-    }
-
-    ss << std::endl << basis.getFunctionSource() << std::endl << "//---[ END ]---------------------------------" << std::endl;
-  }
-}
-
-void CpuOperator::addBasisIfMissingSource(BasisVector &sourceBasis, Basis *basis) {
-  // Avoid adding duplicate sources which will result in colliding symbol names
-
-  // No basis
-  if (!basis) {
-    return;
-  }
-
-  // Fast enough since we expect a small number of inputs/outputs
-  const int existingBasisCount = (int)sourceBasis.size();
-  for (int i = 0; i < existingBasisCount; ++i) {
-    Basis *other = sourceBasis[i];
-    // They are different basis types so other != basis
-    if (basis->isTensorBasis() != other->isTensorBasis()) {
-      continue;
-    }
-
-    if (basis->dim == other->dim && basis->P == other->P && basis->Q == other->Q) {
-      // `other` wil generate the same code
-      return;
-    }
-  }
-
-  // Basis didn't match any other existing basis
-  sourceBasis.push_back(basis);
-}
-
-void CpuOperator::addKernelSource(std::stringstream &ss) {
-  // Make sure there's a break between past code
-  ss << std::endl;
-
-  ss << "@kernel void applyAdd(" << std::endl;
-
-  addKernelArgsSource(ss);
-
-  ss << std::endl
-     << ") {" << std::endl
-     << "  @tile(128, @outer, @inner)" << std::endl
-     << "  for (int element = 0; element < elementCount; ++element) {" << std::endl;
-
-#if CEED_OCCA_PRINT_KERNEL_HASHES
-  // Print to see which kernel is being run
-  ss << "    if (element == 0) {" << std::endl
-     << "      printf(\"\\n\\nOperator Kernel: \" OKL_KERNEL_HASH \"\\n\\n\");" << std::endl
-     << "    }" << std::endl;
-#endif
-
-  addQuadArraySource(ss);
-
-  ss << std::endl << "    // [Start] Transforming inputs to quadrature points" << std::endl;
-  addInputSetupSource(ss);
-  ss << "    // [End] Transforming inputs to quadrature points" << std::endl << std::endl;
-
-  addQFunctionApplicationSource(ss);
-
-  ss << std::endl << "    // [Start] Transforming outputs to quadrature points" << std::endl;
-  addOutputSetupSource(ss);
-  ss << "    // [End] Transforming outputs to quadrature points" << std::endl;
-
-  ss << "  }" << std::endl << "}" << std::endl;
-}
-
-void CpuOperator::addKernelArgsSource(std::stringstream &ss) {
-  ss << "  void *ctx," << std::endl << "  const CeedInt elementCount";
-
-  for (int i = 0; i < args.inputCount(); ++i) {
-    const bool isInput = true;
-    addKernelArgSource(ss, isInput, i);
-  }
-  for (int i = 0; i < args.outputCount(); ++i) {
-    const bool isInput = false;
-    addKernelArgSource(ss, isInput, i);
-  }
-}
-
-void CpuOperator::addKernelArgSource(std::stringstream &ss, const bool isInput, const int index) {
-  const OperatorField  &opField = args.getOpField(isInput, index);
-  const QFunctionField &qfField = args.getQfField(isInput, index);
-
-  std::stringstream dimAttribute;
-  if (opField.hasBasis()) {
-    ss << ',' << std::endl;
-    if (opField.usingTensorBasis()) {
-      addTensorKernelArgSource(ss, isInput, index, opField, qfField, dimAttribute);
-    } else {
-      addSimplexKernelArgSource(ss, isInput, index, opField, qfField, dimAttribute);
-    }
-  }
-
-  ss << ',' << std::endl;
-  if (isInput) {
-    ss << "  const CeedScalar *" << dofInputVar(index) << dimAttribute.str();
-  } else {
-    ss << "  CeedScalar *" << dofOutputVar(index) << dimAttribute.str();
-  }
-}
-
-void CpuOperator::addTensorKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField,
-                                           const QFunctionField &qfField, std::stringstream &dimAttribute) {
-  TensorBasis &basis = *((TensorBasis *)opField.basis);
-
-  dimAttribute << " @dim(";
-
-  if (qfField.evalMode == CEED_EVAL_INTERP) {
-    ss << "  const CeedScalar *" << interpVar(isInput, index);
-
-    // @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)
-    for (int i = 0; i < basis.dim; ++i) {
-      dimAttribute << basis.P1D << ", ";
-    }
-    dimAttribute << basis.ceedComponentCount << ", elementCount";
-  } else if (qfField.evalMode == CEED_EVAL_GRAD) {
-    ss << "  const CeedScalar *" << interpVar(isInput, index) << ',' << std::endl << "  const CeedScalar *" << gradVar(isInput, index);
-
-    // @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)
-    for (int i = 0; i < basis.dim; ++i) {
-      dimAttribute << basis.P1D << ", ";
-    }
-    dimAttribute << basis.ceedComponentCount << ", elementCount";
-  } else if (qfField.evalMode == CEED_EVAL_WEIGHT) {
-    ss << "  const CeedScalar *" << qWeightVar(isInput, index);
-
-    // @dim(Q1D, Q1D, elementCount)
-    for (int i = 0; i < basis.dim; ++i) {
-      dimAttribute << basis.Q1D << ", ";
-    }
-    dimAttribute << "elementCount";
-  } else {
-    // Clear @dim
-    dimAttribute.str("");
-    return;
-  }
-
-  dimAttribute << ")";
-}
-
-void CpuOperator::addSimplexKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField,
-                                            const QFunctionField &qfField, std::stringstream &dimAttribute) {
-  SimplexBasis &basis = *((SimplexBasis *)opField.basis);
-
-  dimAttribute << " @dim(";
-
-  if (qfField.evalMode == CEED_EVAL_INTERP) {
-    ss << "  const CeedScalar *" << interpVar(isInput, index);
-
-    // @dim(P, BASIS_COMPONENT_COUNT, elementCount)
-    dimAttribute << basis.P << ", " << basis.ceedComponentCount << ", elementCount";
-  } else if (qfField.evalMode == CEED_EVAL_GRAD) {
-    ss << "  const CeedScalar *" << gradVar(isInput, index);
-
-    // @dim(P, BASIS_COMPONENT_COUNT, elementCount)
-    dimAttribute << basis.P << ", " << basis.ceedComponentCount << ", elementCount";
-  } else if (qfField.evalMode == CEED_EVAL_WEIGHT) {
-    ss << "  const CeedScalar *" << qWeightVar(isInput, index);
-
-    // @dim(Q, elementCount)
-    dimAttribute << basis.Q << ", "
-                 << "elementCount";
-  } else {
-    // Clear @dim
-    dimAttribute.str("");
-    return;
-  }
-
-  dimAttribute << ")";
-}
-
-void CpuOperator::addQuadArraySource(std::stringstream &ss) {
-  const int inputs  = args.inputCount();
-  const int outputs = args.outputCount();
-
-  const std::string quadInput  = "quadInput";
-  const std::string quadOutput = "quadOutput";
-
-  ss << "    // Store the transformed input quad values" << std::endl;
-  for (int i = 0; i < inputs; ++i) {
-    const bool isInput = true;
-    addSingleQfunctionQuadArraySource(ss, isInput, i, quadInput);
-  }
-
-  ss << std::endl << "    // Store the transformed output quad values" << std::endl;
-  for (int i = 0; i < outputs; ++i) {
-    const bool isInput = false;
-    addSingleQfunctionQuadArraySource(ss, isInput, i, quadOutput);
-  }
-  ss << std::endl;
-
-  ss << std::endl << "    // Store all input pointers in a single array" << std::endl;
-  addQfunctionQuadArraySource(ss, true, inputs, quadInput);
-
-  ss << std::endl << "    // Store all output pointers in a single array" << std::endl;
-  addQfunctionQuadArraySource(ss, false, outputs, quadOutput);
-
-  ss << std::endl;
-}
-
-void CpuOperator::addSingleQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int index, const std::string &name) {
-  // Output:
-  //   CeedScalar quadInput0[DIM][COMPONENTS][OCCA_Q];
-  //   CeedScalar quadInput0[OCCA_Q * SIZE];
-
-  const OperatorField &opField  = args.getOpField(isInput, index);
-  CeedEvalMode         evalMode = args.getEvalMode(isInput, index);
-
-  if (evalMode == CEED_EVAL_GRAD) {
-    ss << "    CeedScalar " << indexedVar(name, index) << "[" << opField.getDim() << "]"
-       << "[" << opField.getComponentCount() << "]"
-       << "[OCCA_Q];" << std::endl;
-  } else if (evalMode == CEED_EVAL_INTERP) {
-    ss << "    CeedScalar " << indexedVar(name, index) << "[" << opField.getComponentCount() << "]"
-       << "[OCCA_Q];" << std::endl;
-  } else {
-    const QFunctionField &qfField = args.getQfField(isInput, index);
-
-    ss << "    CeedScalar " << indexedVar(name, index) << "[OCCA_Q * " << qfField.size << "];" << std::endl;
-  }
-}
-
-void CpuOperator::addQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int count, const std::string &name) {
-  // Output:
-  //   CeedScalar *quadInputs[2] = {
-  //     (CeedScalar*) quadInput0,
-  //     (CeedScalar*) quadInput1
-  //   };
-
-  // Add an 's': quadInput -> quadInputs
-  const std::string arrayName = name + "s";
-
-  ss << "    CeedScalar *" << arrayName << "[" << count << "] = {" << std::endl;
-  for (int i = 0; i < count; ++i) {
-    if (i) {
-      ss << ',' << std::endl;
-    }
-    ss << "      (CeedScalar*) " << indexedVar(name, i);
-  }
-  ss << std::endl << "    };" << std::endl;
-}
-
-void CpuOperator::addInputSetupSource(std::stringstream &ss) {
-  const bool isInput = true;
-  addBasisApplySource(ss, isInput, args.inputCount());
-}
-
-void CpuOperator::addOutputSetupSource(std::stringstream &ss) {
-  const bool isInput = false;
-  addBasisApplySource(ss, isInput, args.outputCount());
-}
-
-void CpuOperator::addBasisApplySource(std::stringstream &ss, const bool isInput, const int count) {
-  for (int i = 0; i < count; ++i) {
-    CeedEvalMode evalMode = args.getEvalMode(isInput, i);
-
-    if (evalMode == CEED_EVAL_INTERP) {
-      addInterpSource(ss, isInput, i);
-    } else if (evalMode == CEED_EVAL_GRAD) {
-      const bool hasTensorBasis = args.getOpField(isInput, i).usingTensorBasis();
-      if (hasTensorBasis) {
-        addGradTensorSource(ss, isInput, i);
-      } else {
-        addGradSimplexSource(ss, isInput, i);
-      }
-    } else if (evalMode == CEED_EVAL_WEIGHT) {
-      addWeightSource(ss, isInput, i);
-    } else if (evalMode == CEED_EVAL_NONE) {
-      addCopySource(ss, isInput, i);
-    }
-  }
-}
-
-void CpuOperator::addInterpSource(std::stringstream &ss, const bool isInput, const int index) {
-  const OperatorField &opField          = args.getOpField(isInput, index);
-  const bool           usingTensorBasis = opField.usingTensorBasis();
-  const int            components       = opField.getComponentCount();
-  const int            dim              = opField.getDim();
-
-  const std::string weights = interpVar(isInput, index);
-
-  std::string dimArgs;
-  if (usingTensorBasis) {
-    for (int i = 0; i < dim; ++i) {
-      if (i) {
-        dimArgs += ", ";
-      }
-      dimArgs += '0';
-    }
-  } else {
-    dimArgs = "0";
-  }
-
-  std::string input, output;
-  if (isInput) {
-    input  = "&" + dofInputVar(index) + "(" + dimArgs + ", component, element)";
-    output = "(CeedScalar*) " + indexedVar("quadInput", index) + "[component]";
-  } else {
-    input  = "(CeedScalar*) " + indexedVar("quadOutput", index) + "[component]";
-    output = "&" + dofOutputVar(index) + "(" + dimArgs + ", component, element)";
-  }
-
-  ss << "    // Applying interp (" << xputName(isInput) << ": " << index << ")" << std::endl
-     << "    for (int component = 0; component < " << components << "; ++component) {" << std::endl
-     << "      " << elementFunction(isInput, index) << "(" << std::endl
-     << "        " << weights << ',' << std::endl
-     << "        " << input << ',' << std::endl
-     << "        " << output << std::endl
-     << "      );" << std::endl
-     << "    }" << std::endl
-     << std::endl;
-}
-
-void CpuOperator::addGradTensorSource(std::stringstream &ss, const bool isInput, const int index) {
-  const OperatorField &opField    = args.getOpField(isInput, index);
-  const int            components = opField.getComponentCount();
-  const int            dim        = opField.getDim();
-
-  const std::string B  = interpVar(isInput, index);
-  const std::string Bx = gradVar(isInput, index);
-
-  std::string dimArgs;
-  for (int i = 0; i < dim; ++i) {
-    if (i) {
-      dimArgs += ", ";
-    }
-    dimArgs += '0';
-  }
-
-  std::string inputs, outputs;
-  if (isInput) {
-    inputs = "&" + dofInputVar(index) + "(" + dimArgs + ", component, element)";
-
-    for (int i = 0; i < dim; ++i) {
-      if (i) {
-        outputs += ",\n        ";
-      }
-      const std::string iStr = std::to_string(i);
-      outputs += "(CeedScalar*) " + indexedVar("quadInput", index) + "[" + iStr + "][component]";
-    }
-  } else {
-    for (int i = 0; i < dim; ++i) {
-      if (i) {
-        inputs += ",\n        ";
-      }
-      const std::string iStr = std::to_string(i);
-      inputs += "(CeedScalar*) " + indexedVar("quadOutput", index) + "[" + iStr + "][component]";
-    }
-
-    outputs = "&" + dofOutputVar(index) + "(" + dimArgs + ", component, element)";
-  }
-
-  ss << "    // Applying grad-tensor (" << xputName(isInput) << ": " << index << ")" << std::endl
-     << "    for (int component = 0; component < " << components << "; ++component) {" << std::endl
-     << "      " << elementFunction(isInput, index) << "(" << std::endl
-     << "        " << B << ',' << std::endl
-     << "        " << Bx << ',' << std::endl
-     << "        " << inputs << ',' << std::endl
-     << "        " << outputs << std::endl
-     << "      );" << std::endl
-     << "    }" << std::endl
-     << std::endl;
-}
-
-void CpuOperator::addGradSimplexSource(std::stringstream &ss, const bool isInput, const int index) {
-  const int components = (args.getOpField(isInput, index).getComponentCount());
-
-  const std::string weights = gradVar(isInput, index);
-
-  std::string input, output;
-  if (isInput) {
-    input  = "&" + dofInputVar(index) + "(0, component, element)";
-    output = "(CeedScalar*) " + indexedVar("quadInput", index) + "[component]";
-  } else {
-    input  = "(CeedScalar*) " + indexedVar("quadOutput", index) + "[component]";
-    output = "&" + dofOutputVar(index) + "(0, component, element)";
-  }
-
-  ss << "    // Applying grad-simplex (" << xputName(isInput) << ": " << index << ")" << std::endl
-     << "    for (int component = 0; component < " << components << "; ++component) {" << std::endl
-     << "      " << elementFunction(isInput, index) << "(" << std::endl
-     << "        " << weights << ',' << std::endl
-     << "        " << input << ',' << std::endl
-     << "        " << output << std::endl
-     << "      );" << std::endl
-     << "    }" << std::endl
-     << std::endl;
-}
-
-void CpuOperator::addWeightSource(std::stringstream &ss, const bool isInput, const int index) {
-  const std::string weights = qWeightVar(isInput, index);
-
-  std::string output;
-  if (isInput) {
-    // TODO: Can the weight operator handle multiple components?
-    output = "(CeedScalar*) " + indexedVar("quadInput", index);
-  } else {
-    output = "&" + dofOutputVar(index) + "(0, element)";
-  }
-
-  ss << "    // Applying weight (" << xputName(isInput) << ": " << index << ")" << std::endl
-     << "    " << elementFunction(isInput, index) << "(" << std::endl
-     << "      " << weights << ',' << std::endl
-     << "      " << output << std::endl
-     << "    );" << std::endl
-     << std::endl;
-}
-
-void CpuOperator::addCopySource(std::stringstream &ss, const bool isInput, const int index) {
-  const QFunctionField &qfField = args.getQfField(isInput, index);
-  const std::string     size    = std::to_string(qfField.size);
-
-  std::string input, output;
-  if (isInput) {
-    input += dofInputVar(index) + "[q + (OCCA_Q * (field + element * " + size + "))]";
-    output += indexedVar("quadInput", index) + "[q + field * OCCA_Q]";
-  } else {
-    input  = indexedVar("quadOutput", index) + "[q + field * OCCA_Q]";
-    output = dofOutputVar(index) + "[q + (OCCA_Q * (field + element * " + size + "))]";
-  }
-
-  ss << "    // Copying source directly (" << xputName(isInput) << ": " << index << ")" << std::endl
-     << "    for (int field = 0; field < " << size << "; ++field) {" << std::endl
-     << "      for (int q = 0; q < OCCA_Q; ++q) {" << std::endl
-     << "        " << output << " = " << input << ";" << std::endl
-     << "      }" << std::endl
-     << "    }" << std::endl
-     << std::endl;
-}
-
-void CpuOperator::addQFunctionApplicationSource(std::stringstream &ss) {
-  ss << "    // Apply qFunction" << std::endl
-     << "    " << qfunction->qFunctionName << "(ctx, OCCA_Q, quadInputs, quadOutputs);" << std::endl
-     << std::endl;
-}
-
-//  ---[ Variables ]-----------------
-std::string CpuOperator::elementFunction(const bool isInput, const int index) {
-  return fullFieldFunctionName(isInput, args.getOpField(isInput, index), args.getQfField(isInput, index));
-}
-
-std::string CpuOperator::fieldFunctionName(const QFunctionField &qfField) {
-  switch (qfField.evalMode) {
-    case CEED_EVAL_INTERP:
-      return "interp";
-    case CEED_EVAL_GRAD:
-      return "grad";
-    case CEED_EVAL_WEIGHT:
-      return "weight";
-    default:
-      return "none";
-  }
-}
-
-std::string CpuOperator::fullFieldFunctionName(const bool isInput, const OperatorField &opField, const QFunctionField &qfField) {
-  // Output:
-  //   - tensor_1d_interpElement_Q2_P2
-  //   - simplex_1d_interpElementTranspose_Q2_P2
-
-  const bool        usingTensorBasis = opField.usingTensorBasis();
-  std::stringstream ss;
-  int               dim, Q, P;
-
-  if (usingTensorBasis) {
-    TensorBasis &basis = *((TensorBasis *)opField.basis);
-    dim                = basis.dim;
-    Q                  = basis.Q1D;
-    P                  = basis.P1D;
-    ss << "tensor_";
-  } else {
-    SimplexBasis &basis = *((SimplexBasis *)opField.basis);
-    dim                 = basis.dim;
-    Q                   = basis.Q;
-    P                   = basis.P;
-    ss << "simplex_";
-  }
-
-  ss << dim << "d_" << fieldFunctionName(qfField) << "Element";
-
-  if (!isInput) {
-    ss << "Transpose";
-  }
-
-  ss << "_Q" << Q << "_P" << P;
-
-  return ss.str();
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-cpu-operator.hpp b/backends/occa/ceed-occa-cpu-operator.hpp
deleted file mode 100644
index e7e79b059c..0000000000
--- a/backends/occa/ceed-occa-cpu-operator.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_CPU_OPERATOR_HEADER
-#define CEED_OCCA_CPU_OPERATOR_HEADER
-
-#include <sstream>
-#include <vector>
-
-#include "ceed-occa-operator.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-class Basis;
-class SimplexBasis;
-class TensorBasis;
-
-class CpuOperator : public Operator {
- private:
-  typedef std::vector<Vector *> VectorVector;
-  typedef std::vector<Basis *>  BasisVector;
-
-  VectorVector dofInputs, dofOutputs;
-
- public:
-  CpuOperator();
-
-  ~CpuOperator();
-
-  // Setup helper vectors
-  void setupVectors();
-
-  void setupVectors(const int fieldCount, OperatorFieldVector &opFields, QFunctionFieldVector &qfFields, VectorVector &vectors);
-
-  void freeVectors();
-
-  // Restriction operators
-  void setupInputs(Vector *in);
-
-  void setupOutputs(Vector *out);
-
-  void applyQFunction();
-
-  // Push arguments for a given field
-  void pushKernelArgs(Vector *vec, const bool isInput, const int index);
-
-  void pushTensorBasisKernelArgs(const QFunctionField &qfField, TensorBasis &basis);
-
-  void pushSimplexBasisKernelArgs(const QFunctionField &qfField, SimplexBasis &basis);
-
-  // Set props for a given field
-  ::occa::properties getKernelProps();
-
-  void applyAdd(Vector *in, Vector *out);
-
-  ::occa::kernel buildApplyAddKernel();
-
-  //---[ Kernel Generation ]------------------
-  void addBasisFunctionSource(std::stringstream &ss);
-
-  void addBasisIfMissingSource(BasisVector &sourceBasis, Basis *basis);
-
-  void addKernelSource(std::stringstream &ss);
-
-  void addKernelArgsSource(std::stringstream &ss);
-
-  void addKernelArgSource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addTensorKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField,
-                                const QFunctionField &qfField, std::stringstream &dimAttribute);
-
-  void addSimplexKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField,
-                                 const QFunctionField &qfField, std::stringstream &dimAttribute);
-
-  void addQuadArraySource(std::stringstream &ss);
-
-  void addSingleQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int index, const std::string &name);
-
-  void addQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int count, const std::string &name);
-
-  void addInputSetupSource(std::stringstream &ss);
-
-  void addOutputSetupSource(std::stringstream &ss);
-
-  void addBasisApplySource(std::stringstream &ss, const bool isInput, const int count);
-
-  void addInterpSource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addGradTensorSource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addGradSimplexSource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addWeightSource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addCopySource(std::stringstream &ss, const bool isInput, const int index);
-
-  void addQFunctionApplicationSource(std::stringstream &ss);
-
-  //  ---[ Variables ]---------------
-  inline std::string xputName(const bool isInput) { return isInput ? "input" : "output"; }
-
-  inline std::string indexedVar(const std::string &name, const int index) { return name + std::to_string(index); }
-
-  inline std::string indexedVar(const std::string &name, const bool isInput, const int index) {
-    return (isInput ? "input" : "output") + std::to_string(index) + "_" + name;
-  }
-
-  inline std::string dofInputVar(const int index) { return indexedVar("dofInput", index); }
-
-  inline std::string dofOutputVar(const int index) { return indexedVar("dofOutput", index); }
-
-  inline std::string interpVar(const bool isInput, const int index) { return indexedVar("B", isInput, index); }
-
-  inline std::string gradVar(const bool isInput, const int index) { return indexedVar("Bx", isInput, index); }
-
-  inline std::string qWeightVar(const bool isInput, const int index) { return indexedVar("qWeights", isInput, index); }
-
-  std::string elementFunction(const bool isInput, const int index);
-
-  std::string fieldFunctionName(const QFunctionField &qfField);
-
-  std::string fullFieldFunctionName(const bool isInput, const OperatorField &opField, const QFunctionField &qfField);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-elem-restriction.cpp b/backends/occa/ceed-occa-elem-restriction.cpp
deleted file mode 100644
index 7bfae3d87f..0000000000
--- a/backends/occa/ceed-occa-elem-restriction.cpp
+++ /dev/null
@@ -1,372 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "./ceed-occa-elem-restriction.hpp"
-
-#include <cstring>
-#include <map>
-
-#include "./ceed-occa-kernels.hpp"
-#include "./ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-ElemRestriction::ElemRestriction()
-    : ceedElementCount(0),
-      ceedElementSize(0),
-      ceedComponentCount(0),
-      ceedLVectorSize(0),
-      ceedNodeStride(0),
-      ceedComponentStride(0),
-      ceedElementStride(0),
-      ceedUnstridedComponentStride(0),
-      freeHostIndices(true),
-      hostIndices(NULL),
-      freeIndices(true) {}
-
-ElemRestriction::~ElemRestriction() {
-  if (freeHostIndices) {
-    CeedFree(&hostIndices);
-  }
-  if (freeIndices) {
-    indices.free();
-  }
-}
-
-void ElemRestriction::setup(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput) {
-  if (memType == CEED_MEM_HOST) {
-    setupFromHostMemory(copyMode, indicesInput);
-  } else {
-    setupFromDeviceMemory(copyMode, indicesInput);
-  }
-
-  setupTransposeIndices();
-}
-
-void ElemRestriction::setupFromHostMemory(CeedCopyMode copyMode, const CeedInt *indices_h) {
-  const CeedInt entries = ceedElementCount * ceedElementSize;
-
-  freeHostIndices = (copyMode == CEED_OWN_POINTER || copyMode == CEED_COPY_VALUES);
-
-  if (copyMode != CEED_COPY_VALUES) {
-    hostIndices = const_cast<CeedInt *>(indices_h);
-  } else {
-    const size_t bytes = entries * sizeof(CeedInt);
-    hostIndices        = (CeedInt *)::malloc(bytes);
-    std::memcpy(hostIndices, indices_h, bytes);
-  }
-
-  if (hostIndices) {
-    indices = getDevice().malloc<CeedInt>(entries, hostIndices);
-  }
-}
-
-void ElemRestriction::setupFromDeviceMemory(CeedCopyMode copyMode, const CeedInt *indices_d) {
-  ::occa::memory deviceIndices = arrayToMemory(indices_d);
-
-  freeIndices = (copyMode == CEED_OWN_POINTER);
-
-  if (copyMode == CEED_COPY_VALUES) {
-    indices = deviceIndices.clone();
-  } else {
-    indices = deviceIndices;
-  }
-}
-
-bool ElemRestriction::usesIndices() { return indices.isInitialized(); }
-
-void ElemRestriction::setupTransposeIndices() {
-  if (!usesIndices() || transposeQuadIndices.isInitialized()) {
-    return;
-  }
-
-  const CeedInt elementEntryCount = ceedElementCount * ceedElementSize;
-
-  bool *indexIsUsed = new bool[ceedLVectorSize];
-  std::memset(indexIsUsed, 0, ceedLVectorSize * sizeof(bool));
-
-  for (CeedInt i = 0; i < elementEntryCount; ++i) {
-    indexIsUsed[hostIndices[i]] = true;
-  }
-
-  CeedInt nodeCount = 0;
-  for (CeedInt i = 0; i < ceedLVectorSize; ++i) {
-    nodeCount += indexIsUsed[i];
-  }
-
-  const CeedInt dofOffsetCount         = nodeCount + 1;
-  CeedInt      *quadIndexToDofOffset   = new CeedInt[ceedLVectorSize];
-  CeedInt      *transposeQuadIndices_h = new CeedInt[nodeCount];
-  CeedInt      *transposeDofOffsets_h  = new CeedInt[dofOffsetCount];
-  CeedInt      *transposeDofIndices_h  = new CeedInt[elementEntryCount];
-
-  std::memset(transposeDofOffsets_h, 0, dofOffsetCount * sizeof(CeedInt));
-
-  // Compute ids
-  CeedInt offsetId = 0;
-  for (CeedInt i = 0; i < ceedLVectorSize; ++i) {
-    if (indexIsUsed[i]) {
-      transposeQuadIndices_h[offsetId] = i;
-      quadIndexToDofOffset[i]          = offsetId++;
-    }
-  }
-
-  // Count how many times a specific quad node is used
-  for (CeedInt i = 0; i < elementEntryCount; ++i) {
-    ++transposeDofOffsets_h[quadIndexToDofOffset[hostIndices[i]] + 1];
-  }
-
-  // Aggregate to find true offsets
-  for (CeedInt i = 1; i < dofOffsetCount; ++i) {
-    transposeDofOffsets_h[i] += transposeDofOffsets_h[i - 1];
-  }
-
-  // Compute dof indices
-  for (CeedInt i = 0; i < elementEntryCount; ++i) {
-    const CeedInt quadIndex         = hostIndices[i];
-    const CeedInt dofIndex          = transposeDofOffsets_h[quadIndexToDofOffset[quadIndex]]++;
-    transposeDofIndices_h[dofIndex] = i;
-  }
-
-  // Reset offsets
-  for (int i = dofOffsetCount - 1; i > 0; --i) {
-    transposeDofOffsets_h[i] = transposeDofOffsets_h[i - 1];
-  }
-  transposeDofOffsets_h[0] = 0;
-
-  // Copy to device
-  ::occa::device device = getDevice();
-
-  transposeQuadIndices = device.malloc<CeedInt>(nodeCount, transposeQuadIndices_h);
-  transposeDofOffsets  = device.malloc<CeedInt>(dofOffsetCount, transposeDofOffsets_h);
-  transposeDofIndices  = device.malloc<CeedInt>(elementEntryCount, transposeDofIndices_h);
-
-  // Clean up temporary arrays
-  delete[] indexIsUsed;
-  delete[] quadIndexToDofOffset;
-  delete[] transposeQuadIndices_h;
-  delete[] transposeDofOffsets_h;
-  delete[] transposeDofIndices_h;
-}
-
-void ElemRestriction::setKernelProperties() {
-  kernelProperties["defines/CeedInt"]                    = ::occa::dtype::get<CeedInt>().name();
-  kernelProperties["defines/CeedScalar"]                 = ::occa::dtype::get<CeedScalar>().name();
-  kernelProperties["defines/COMPONENT_COUNT"]            = ceedComponentCount;
-  kernelProperties["defines/ELEMENT_SIZE"]               = ceedElementSize;
-  kernelProperties["defines/TILE_SIZE"]                  = 64;
-  kernelProperties["defines/USES_INDICES"]               = usesIndices();
-  kernelProperties["defines/USER_STRIDES"]               = StrideType::USER_STRIDES;
-  kernelProperties["defines/NOT_STRIDED"]                = StrideType::NOT_STRIDED;
-  kernelProperties["defines/BACKEND_STRIDES"]            = StrideType::BACKEND_STRIDES;
-  kernelProperties["defines/STRIDE_TYPE"]                = ceedStrideType;
-  kernelProperties["defines/NODE_COUNT"]                 = transposeQuadIndices.length();
-  kernelProperties["defines/NODE_STRIDE"]                = ceedNodeStride;
-  kernelProperties["defines/COMPONENT_STRIDE"]           = ceedComponentStride;
-  kernelProperties["defines/ELEMENT_STRIDE"]             = ceedElementStride;
-  kernelProperties["defines/UNSTRIDED_COMPONENT_STRIDE"] = ceedUnstridedComponentStride;
-}
-
-ElemRestriction *ElemRestriction::getElemRestriction(CeedElemRestriction r, const bool assertValid) {
-  if (!r || r == CEED_ELEMRESTRICTION_NONE) {
-    return NULL;
-  }
-
-  int              ierr;
-  ElemRestriction *elemRestriction = NULL;
-
-  ierr = CeedElemRestrictionGetData(r, (void **)&elemRestriction);
-  if (assertValid) {
-    CeedOccaFromChk(ierr);
-  }
-
-  return elemRestriction;
-}
-
-ElemRestriction *ElemRestriction::from(CeedElemRestriction r) {
-  ElemRestriction *elemRestriction = getElemRestriction(r);
-  if (!elemRestriction) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedElemRestrictionGetCeed(r, &elemRestriction->ceed));
-
-  return elemRestriction->setupFrom(r);
-}
-
-ElemRestriction *ElemRestriction::from(CeedOperatorField operatorField) {
-  CeedElemRestriction ceedElemRestriction;
-
-  CeedCallOcca(CeedOperatorFieldGetElemRestriction(operatorField, &ceedElemRestriction));
-
-  return from(ceedElemRestriction);
-}
-
-ElemRestriction *ElemRestriction::setupFrom(CeedElemRestriction r) {
-  CeedCallOcca(CeedElemRestrictionGetNumElements(r, &ceedElementCount));
-
-  CeedCallOcca(CeedElemRestrictionGetElementSize(r, &ceedElementSize));
-
-  CeedCallOcca(CeedElemRestrictionGetNumComponents(r, &ceedComponentCount));
-
-  CeedCallOcca(CeedElemRestrictionGetLVectorSize(r, &ceedLVectorSize));
-
-  // Find what type of striding the restriction uses
-  bool isStrided         = false;
-  bool hasBackendStrides = false;
-
-  CeedCallOcca(CeedElemRestrictionIsStrided(r, &isStrided));
-
-  if (isStrided) {
-    CeedCallOcca(CeedElemRestrictionHasBackendStrides(r, &hasBackendStrides));
-  }
-
-  if (isStrided) {
-    if (hasBackendStrides) {
-      ceedStrideType = BACKEND_STRIDES;
-    } else {
-      ceedStrideType = USER_STRIDES;
-    }
-  } else {
-    ceedStrideType = NOT_STRIDED;
-  }
-
-  // Default strides
-  ceedNodeStride               = 1;
-  ceedComponentStride          = ceedElementSize;
-  ceedElementStride            = ceedElementSize * ceedComponentCount;
-  ceedUnstridedComponentStride = 1;
-
-  if (ceedStrideType == USER_STRIDES) {
-    CeedInt strides[3];
-
-    CeedCallOcca(CeedElemRestrictionGetStrides(r, strides));
-
-    ceedNodeStride      = strides[0];
-    ceedComponentStride = strides[1];
-    ceedElementStride   = strides[2];
-
-  } else if (ceedStrideType == NOT_STRIDED) {
-    CeedCallOcca(CeedElemRestrictionGetCompStride(r, &ceedUnstridedComponentStride));
-  }
-
-  return this;
-}
-
-int ElemRestriction::apply(CeedTransposeMode rTransposeMode, Vector &u, Vector &v) {
-  const bool rIsTransposed = (rTransposeMode != CEED_NOTRANSPOSE);
-
-  // Todo: refactor
-  if (rIsTransposed) {
-    if (!restrictionTransposeKernel.isInitialized()) {
-      setKernelProperties();
-      restrictionTransposeKernel = getDevice().buildKernelFromString(occa_elem_restriction_source, "applyRestrictionTranspose", kernelProperties);
-    }
-    restrictionTransposeKernel(ceedElementCount, transposeQuadIndices, transposeDofOffsets, transposeDofIndices, u.getConstKernelArg(),
-                               v.getKernelArg());
-  } else {
-    if (!restrictionKernel.isInitialized()) {
-      setKernelProperties();
-      restrictionKernel = getDevice().buildKernelFromString(occa_elem_restriction_source, "applyRestriction", kernelProperties);
-    }
-    restrictionKernel(ceedElementCount, indices, u.getConstKernelArg(), v.getKernelArg());
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int ElemRestriction::getOffsets(CeedMemType memType, const CeedInt **offsets) {
-  switch (memType) {
-    case CEED_MEM_HOST: {
-      *offsets = hostIndices;
-      return CEED_ERROR_SUCCESS;
-    }
-    case CEED_MEM_DEVICE: {
-      *offsets = memoryToArray<CeedInt>(indices);
-      return CEED_ERROR_SUCCESS;
-    }
-  }
-  return ceedError("Unsupported CeedMemType passed to ElemRestriction::getOffsets");
-}
-
-//---[ Ceed Callbacks ]-----------
-int ElemRestriction::registerCeedFunction(Ceed ceed, CeedElemRestriction r, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "ElemRestriction", r, fname, f);
-}
-
-int ElemRestriction::ceedCreate(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput, const bool *orientsInput,
-                                const CeedInt8 *curlOrientsInput, CeedElemRestriction r) {
-  Ceed ceed;
-  CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
-
-  if ((memType != CEED_MEM_DEVICE) && (memType != CEED_MEM_HOST)) {
-    return staticCeedError("Only HOST and DEVICE CeedMemType supported");
-  }
-
-  CeedRestrictionType rstr_type;
-  CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type));
-  if ((rstr_type == CEED_RESTRICTION_ORIENTED) || (rstr_type == CEED_RESTRICTION_CURL_ORIENTED)) {
-    return staticCeedError("(OCCA) Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented");
-  }
-
-  ElemRestriction *elemRestriction = new ElemRestriction();
-  CeedCallBackend(CeedElemRestrictionSetData(r, elemRestriction));
-
-  // Setup Ceed objects before setting up memory
-  elemRestriction = ElemRestriction::from(r);
-  elemRestriction->setup(memType, copyMode, indicesInput);
-
-  CeedInt defaultLayout[3] = {1, elemRestriction->ceedElementSize, elemRestriction->ceedElementSize * elemRestriction->ceedComponentCount};
-  CeedCallBackend(CeedElemRestrictionSetELayout(r, defaultLayout));
-
-  CeedOccaRegisterFunction(r, "Apply", ElemRestriction::ceedApply);
-  CeedOccaRegisterFunction(r, "ApplyUnsigned", ElemRestriction::ceedApply);
-  CeedOccaRegisterFunction(r, "ApplyUnoriented", ElemRestriction::ceedApply);
-  CeedOccaRegisterFunction(r, "ApplyBlock", ElemRestriction::ceedApplyBlock);
-  CeedOccaRegisterFunction(r, "GetOffsets", ElemRestriction::ceedGetOffsets);
-  CeedOccaRegisterFunction(r, "Destroy", ElemRestriction::ceedDestroy);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int ElemRestriction::ceedApply(CeedElemRestriction r, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request) {
-  ElemRestriction *elemRestriction = ElemRestriction::from(r);
-  Vector          *uVector         = Vector::from(u);
-  Vector          *vVector         = Vector::from(v);
-
-  if (!elemRestriction) {
-    return staticCeedError("Incorrect CeedElemRestriction argument: r");
-  }
-  if (!uVector) {
-    return elemRestriction->ceedError("Incorrect CeedVector argument: u");
-  }
-  if (!vVector) {
-    return elemRestriction->ceedError("Incorrect CeedVector argument: v");
-  }
-
-  return elemRestriction->apply(tmode, *uVector, *vVector);
-}
-
-int ElemRestriction::ceedApplyBlock(CeedElemRestriction r, CeedInt block, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request) {
-  return staticCeedError("(OCCA) Backend does not implement CeedElemRestrictionApplyBlock");
-}
-
-int ElemRestriction::ceedGetOffsets(CeedElemRestriction r, CeedMemType memType, const CeedInt **offsets) {
-  ElemRestriction *elemRestriction = ElemRestriction::from(r);
-
-  if (!elemRestriction) {
-    return staticCeedError("Incorrect CeedElemRestriction argument: r");
-  }
-
-  return elemRestriction->getOffsets(memType, offsets);
-}
-
-int ElemRestriction::ceedDestroy(CeedElemRestriction r) {
-  delete getElemRestriction(r, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-elem-restriction.hpp b/backends/occa/ceed-occa-elem-restriction.hpp
deleted file mode 100644
index 7ac03146b8..0000000000
--- a/backends/occa/ceed-occa-elem-restriction.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_ELEMRESTRICTION_HEADER
-#define CEED_OCCA_ELEMRESTRICTION_HEADER
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-enum StrideType {
-  BACKEND_STRIDES = 0,
-  USER_STRIDES    = 1,
-  NOT_STRIDED     = 2,
-};
-
-class ElemRestriction : public CeedObject {
- public:
-  // Ceed object information
-  CeedInt    ceedElementCount;
-  CeedInt    ceedElementSize;
-  CeedInt    ceedComponentCount;
-  CeedSize   ceedLVectorSize;
-  StrideType ceedStrideType;
-  CeedInt    ceedNodeStride;
-  CeedInt    ceedComponentStride;
-  CeedInt    ceedElementStride;
-  CeedInt    ceedUnstridedComponentStride;
-
-  // Passed resources
-  bool     freeHostIndices;
-  CeedInt *hostIndices;
-
-  // Owned resources
-  bool           freeIndices;
-  ::occa::memory indices;
-
-  ::occa::memory transposeQuadIndices;
-  ::occa::memory transposeDofOffsets;
-  ::occa::memory transposeDofIndices;
-
-  ::occa::json   kernelProperties;
-  ::occa::kernel restrictionKernel;
-  ::occa::kernel restrictionTransposeKernel;
-
-  ElemRestriction();
-
-  ~ElemRestriction();
-
-  void setup(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput);
-
-  void setupFromHostMemory(CeedCopyMode copyMode, const CeedInt *indices_h);
-
-  void setupFromDeviceMemory(CeedCopyMode copyMode, const CeedInt *indices_d);
-
-  bool usesIndices();
-
-  void setupTransposeIndices();
-
-  void setKernelProperties();
-
-  static ElemRestriction *getElemRestriction(CeedElemRestriction r, const bool assertValid = true);
-
-  static ElemRestriction *from(CeedElemRestriction r);
-  static ElemRestriction *from(CeedOperatorField operatorField);
-  ElemRestriction        *setupFrom(CeedElemRestriction r);
-
-  int apply(CeedTransposeMode rTransposeMode, Vector &u, Vector &v);
-
-  int getOffsets(CeedMemType memType, const CeedInt **offsets);
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedElemRestriction r, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedCreate(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput, const bool *orientsInput,
-                        const CeedInt8 *curlOrientsInput, CeedElemRestriction r);
-
-  static int ceedApply(CeedElemRestriction r, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request);
-
-  static int ceedGetOffsets(CeedElemRestriction r, CeedMemType memType, const CeedInt **offsets);
-
-  static int ceedApplyBlock(CeedElemRestriction r, CeedInt block, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request);
-
-  static int ceedDestroy(CeedElemRestriction r);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-gpu-operator.cpp b/backends/occa/ceed-occa-gpu-operator.cpp
deleted file mode 100644
index af7a43becd..0000000000
--- a/backends/occa/ceed-occa-gpu-operator.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-gpu-operator.hpp"
-
-#include "ceed-occa-qfunction.hpp"
-
-namespace ceed {
-namespace occa {
-GpuOperator::GpuOperator() {}
-
-GpuOperator::~GpuOperator() {}
-
-::occa::kernel GpuOperator::buildApplyAddKernel() { return ::occa::kernel(); }
-
-void GpuOperator::applyAdd(Vector *in, Vector *out) {
-  // TODO: Implement
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-gpu-operator.hpp b/backends/occa/ceed-occa-gpu-operator.hpp
deleted file mode 100644
index fc14304975..0000000000
--- a/backends/occa/ceed-occa-gpu-operator.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_GPU_OPERATOR_HEADER
-#define CEED_OCCA_GPU_OPERATOR_HEADER
-
-#include <vector>
-
-#include "ceed-occa-operator.hpp"
-
-namespace ceed {
-namespace occa {
-class GpuOperator : public Operator {
- public:
-  GpuOperator();
-
-  ~GpuOperator();
-
-  ::occa::kernel buildApplyAddKernel();
-
-  void applyAdd(Vector *in, Vector *out);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-kernels.hpp b/backends/occa/ceed-occa-kernels.hpp
deleted file mode 100644
index 86469be1f1..0000000000
--- a/backends/occa/ceed-occa-kernels.hpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_KERNELS_HEADER
-#define CEED_OCCA_KERNELS_HEADER
-
-#include "./kernels/elem-restriction.hpp"
-#include "./kernels/set-value.hpp"
-#include "./kernels/simplex-basis.hpp"
-#include "./kernels/tensor-basis.hpp"
-
-#endif
diff --git a/backends/occa/ceed-occa-operator-args.cpp b/backends/occa/ceed-occa-operator-args.cpp
deleted file mode 100644
index 61199ce288..0000000000
--- a/backends/occa/ceed-occa-operator-args.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-operator-args.hpp"
-
-namespace ceed {
-namespace occa {
-OperatorArgs::OperatorArgs() : QFunctionArgs() {}
-
-OperatorArgs::OperatorArgs(CeedOperator op) : QFunctionArgs() { setupArgs(op); }
-
-void OperatorArgs::setupArgs(CeedOperator op) {
-  CeedQFunction      qf;
-  CeedOperatorField *ceedInputFields, *ceedOutputFields;
-
-  CeedCallOccaValid(_isValid, CeedOperatorGetQFunction(op, &qf));
-  setupQFunctionArgs(qf);
-
-  if (!_isValid) {
-    return;
-  }
-
-  CeedCallOccaValid(_isValid, CeedOperatorGetFields(op, NULL, &ceedInputFields, NULL, &ceedOutputFields));
-
-  for (int i = 0; i < _inputCount; ++i) {
-    OperatorField field = OperatorField(ceedInputFields[i]);
-    opInputs.push_back(field);
-    _isValid &= field.isValid();
-  }
-
-  for (int i = 0; i < _outputCount; ++i) {
-    OperatorField field = OperatorField(ceedOutputFields[i]);
-    opOutputs.push_back(field);
-    _isValid &= field.isValid();
-  }
-}
-
-const OperatorField &OperatorArgs::getOpField(const bool isInput, const int index) const { return isInput ? opInputs[index] : opOutputs[index]; }
-
-const OperatorField &OperatorArgs::getOpInput(const int index) const { return opInputs[index]; }
-
-const OperatorField &OperatorArgs::getOpOutput(const int index) const { return opOutputs[index]; }
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-operator-args.hpp b/backends/occa/ceed-occa-operator-args.hpp
deleted file mode 100644
index 5edf95188c..0000000000
--- a/backends/occa/ceed-occa-operator-args.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_OPERATORARGS_HEADER
-#define CEED_OCCA_OPERATORARGS_HEADER
-
-#include <vector>
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-operator-field.hpp"
-#include "ceed-occa-qfunction-args.hpp"
-
-namespace ceed {
-namespace occa {
-typedef std::vector<OperatorField> OperatorFieldVector;
-
-class OperatorArgs : public QFunctionArgs {
- public:
-  OperatorFieldVector opInputs;
-  OperatorFieldVector opOutputs;
-
-  OperatorArgs();
-  OperatorArgs(CeedOperator op);
-
-  void setupArgs(CeedOperator op);
-
-  const OperatorField &getOpField(const bool isInput, const int index) const;
-
-  const OperatorField &getOpInput(const int index) const;
-
-  const OperatorField &getOpOutput(const int index) const;
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-operator-field.cpp b/backends/occa/ceed-occa-operator-field.cpp
deleted file mode 100644
index 6716d11e06..0000000000
--- a/backends/occa/ceed-occa-operator-field.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-operator-field.hpp"
-
-#include "ceed-occa-basis.hpp"
-#include "ceed-occa-elem-restriction.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-OperatorField::OperatorField(CeedOperatorField opField) : _isValid(false), _usesActiveVector(false), vec(NULL), basis(NULL), elemRestriction(NULL) {
-  CeedBasis           ceedBasis;
-  CeedVector          ceedVector;
-  CeedElemRestriction ceedElemRestriction;
-
-  CeedCallOccaValid(_isValid, CeedOperatorFieldGetBasis(opField, &ceedBasis));
-
-  CeedCallOccaValid(_isValid, CeedOperatorFieldGetVector(opField, &ceedVector));
-
-  CeedCallOccaValid(_isValid, CeedOperatorFieldGetElemRestriction(opField, &ceedElemRestriction));
-
-  _isValid          = true;
-  _usesActiveVector = ceedVector == CEED_VECTOR_ACTIVE;
-
-  vec             = Vector::from(ceedVector);
-  basis           = Basis::from(ceedBasis);
-  elemRestriction = ElemRestriction::from(ceedElemRestriction);
-}
-
-bool OperatorField::isValid() const { return _isValid; }
-
-//---[ Vector Info ]----------------
-bool OperatorField::usesActiveVector() const { return _usesActiveVector; }
-//==================================
-
-//---[ Basis Info ]-----------------
-bool OperatorField::hasBasis() const { return basis; }
-
-int OperatorField::usingTensorBasis() const { return basis->isTensorBasis(); }
-
-int OperatorField::getComponentCount() const { return (basis ? basis->ceedComponentCount : 1); }
-
-int OperatorField::getP() const { return (basis ? basis->P : 0); }
-
-int OperatorField::getQ() const { return (basis ? basis->Q : 0); }
-
-int OperatorField::getDim() const { return (basis ? basis->dim : 1); }
-//==================================
-
-//---[ ElemRestriction Info ]-------
-int OperatorField::getElementCount() const { return (elemRestriction ? elemRestriction->ceedElementCount : 1); }
-
-int OperatorField::getElementSize() const { return (elemRestriction ? elemRestriction->ceedElementSize : 1); }
-//==================================
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-operator-field.hpp b/backends/occa/ceed-occa-operator-field.hpp
deleted file mode 100644
index 4eeb5e70ed..0000000000
--- a/backends/occa/ceed-occa-operator-field.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_OPERATORFIELD_HEADER
-#define CEED_OCCA_OPERATORFIELD_HEADER
-
-#include "ceed-occa-context.hpp"
-
-namespace ceed {
-namespace occa {
-class Basis;
-class ElemRestriction;
-class Vector;
-
-class OperatorField {
- private:
-  bool _isValid;
-  bool _usesActiveVector;
-
- public:
-  Vector          *vec;
-  Basis           *basis;
-  ElemRestriction *elemRestriction;
-
-  OperatorField(CeedOperatorField opField);
-
-  bool isValid() const;
-
-  //---[ Vector Info ]--------------
-  bool usesActiveVector() const;
-  //================================
-
-  //---[ Basis Info ]---------------
-  bool hasBasis() const;
-  int  usingTensorBasis() const;
-
-  int getComponentCount() const;
-  int getP() const;
-  int getQ() const;
-  int getDim() const;
-  //================================
-
-  //---[ ElemRestriction Info ]-----
-  int getElementCount() const;
-  int getElementSize() const;
-  //================================
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-operator.cpp b/backends/occa/ceed-occa-operator.cpp
deleted file mode 100644
index c19e875033..0000000000
--- a/backends/occa/ceed-occa-operator.cpp
+++ /dev/null
@@ -1,151 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-operator.hpp"
-
-#include "ceed-occa-basis.hpp"
-#include "ceed-occa-cpu-operator.hpp"
-#include "ceed-occa-elem-restriction.hpp"
-#include "ceed-occa-gpu-operator.hpp"
-#include "ceed-occa-qfunction.hpp"
-
-namespace ceed {
-namespace occa {
-Operator::Operator() : ceedQ(0), ceedElementCount(0), qfunction(NULL), needsInitialSetup(true) {}
-
-Operator::~Operator() {}
-
-Operator *Operator::getOperator(CeedOperator op, const bool assertValid) {
-  if (!op) {
-    return NULL;
-  }
-
-  int       ierr;
-  Operator *operator_ = NULL;
-
-  ierr = CeedOperatorGetData(op, (void **)&operator_);
-  if (assertValid) {
-    CeedOccaFromChk(ierr);
-  }
-
-  return operator_;
-}
-
-Operator *Operator::from(CeedOperator op) {
-  Operator *operator_ = getOperator(op);
-  if (!operator_) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedOperatorGetCeed(op, &operator_->ceed));
-
-  operator_->qfunction = QFunction::from(op);
-  if (!operator_->qfunction) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedOperatorGetNumQuadraturePoints(op, &operator_->ceedQ));
-  CeedCallOcca(CeedOperatorGetNumElements(op, &operator_->ceedElementCount));
-
-  operator_->args.setupArgs(op);
-  if (!operator_->args.isValid()) {
-    return NULL;
-  }
-
-  return operator_;
-}
-
-bool Operator::isApplyingIdentityFunction() { return qfunction->ceedIsIdentity; }
-
-int Operator::applyAdd(Vector *in, Vector *out, CeedRequest *request) {
-  // TODO: Cache kernel objects rather than relying on OCCA kernel caching
-  applyAddKernel = buildApplyAddKernel();
-
-  if (needsInitialSetup) {
-    initialSetup();
-    needsInitialSetup = false;
-  }
-
-  applyAdd(in, out);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-//---[ Virtual Methods ]------------
-void Operator::initialSetup() {}
-
-//---[ Ceed Callbacks ]-------------
-int Operator::registerCeedFunction(Ceed ceed, CeedOperator op, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "Operator", op, fname, f);
-}
-
-int Operator::ceedCreate(CeedOperator op) {
-  Ceed ceed;
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-
-#if 1
-  Operator *operator_ = new CpuOperator();
-#else
-  // TODO: Add GPU specific operator
-  Operator *operator_ = (Context::from(ceed)->usingCpuDevice() ? ((Operator *)new CpuOperator()) : ((Operator *)new GpuOperator()));
-#endif
-
-  CeedCallBackend(CeedOperatorSetData(op, operator_));
-
-  CeedOccaRegisterFunction(op, "LinearAssembleQFunction", Operator::ceedLinearAssembleQFunction);
-  CeedOccaRegisterFunction(op, "LinearAssembleQFunctionUpdate", Operator::ceedLinearAssembleQFunction);
-  CeedOccaRegisterFunction(op, "LinearAssembleAddDiagonal", Operator::ceedLinearAssembleAddDiagonal);
-  CeedOccaRegisterFunction(op, "LinearAssembleAddPointBlockDiagonal", Operator::ceedLinearAssembleAddPointBlockDiagonal);
-  CeedOccaRegisterFunction(op, "CreateFDMElementInverse", Operator::ceedCreateFDMElementInverse);
-  CeedOccaRegisterFunction(op, "ApplyAdd", Operator::ceedApplyAdd);
-  CeedOccaRegisterFunction(op, "Destroy", Operator::ceedDestroy);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int Operator::ceedCreateComposite(CeedOperator op) {
-  Ceed ceed;
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-
-  CeedOccaRegisterFunction(op, "LinearAssembleAddDiagonal", Operator::ceedLinearAssembleAddDiagonal);
-  CeedOccaRegisterFunction(op, "LinearAssembleAddPointBlockDiagonal", Operator::ceedLinearAssembleAddPointBlockDiagonal);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int Operator::ceedLinearAssembleQFunction(CeedOperator op) { return staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunction"); }
-
-int Operator::ceedLinearAssembleQFunctionUpdate(CeedOperator op) {
-  return staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunctionUpdate");
-}
-
-int Operator::ceedLinearAssembleAddDiagonal(CeedOperator op) { return staticCeedError("(OCCA) Backend does not implement LinearAssembleDiagonal"); }
-
-int Operator::ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op) {
-  return staticCeedError("(OCCA) Backend does not implement LinearAssemblePointBlockDiagonal");
-}
-
-int Operator::ceedCreateFDMElementInverse(CeedOperator op) { return staticCeedError("(OCCA) Backend does not implement CreateFDMElementInverse"); }
-
-int Operator::ceedApplyAdd(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request) {
-  Operator *operator_ = Operator::from(op);
-  Vector   *in        = Vector::from(invec);
-  Vector   *out       = Vector::from(outvec);
-
-  if (!operator_) {
-    return staticCeedError("Incorrect CeedOperator argument: op");
-  }
-
-  return operator_->applyAdd(in, out, request);
-}
-
-int Operator::ceedDestroy(CeedOperator op) {
-  delete getOperator(op, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-operator.hpp b/backends/occa/ceed-occa-operator.hpp
deleted file mode 100644
index 5325bdf33d..0000000000
--- a/backends/occa/ceed-occa-operator.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_OPERATOR_HEADER
-#define CEED_OCCA_OPERATOR_HEADER
-
-#include <vector>
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-operator-args.hpp"
-
-namespace ceed {
-namespace occa {
-typedef std::vector<ceed::occa::Vector *> VectorVector_t;
-
-class QFunction;
-
-class Operator : public CeedObject {
- public:
-  // Ceed object information
-  CeedInt ceedQ;
-  CeedInt ceedElementCount;
-
-  // Owned resources
-  QFunction     *qfunction;
-  OperatorArgs   args;
-  ::occa::kernel applyAddKernel;
-  bool           needsInitialSetup;
-
-  // Reference to other memory
-  ::occa::memory qFunctionContextData;
-
-  Operator();
-  virtual ~Operator();
-
-  static Operator *getOperator(CeedOperator op, const bool assertValid = true);
-
-  static Operator *from(CeedOperator op);
-
-  bool isApplyingIdentityFunction();
-
-  int applyAdd(Vector *in, Vector *out, CeedRequest *request);
-
-  //---[ Virtual Methods ]----------
-  virtual ::occa::kernel buildApplyAddKernel() = 0;
-
-  virtual void initialSetup();
-
-  virtual void applyAdd(Vector *in, Vector *out) = 0;
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedOperator op, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedCreate(CeedOperator op);
-  static int ceedCreateComposite(CeedOperator op);
-
-  static int ceedLinearAssembleQFunction(CeedOperator op);
-  static int ceedLinearAssembleQFunctionUpdate(CeedOperator op);
-  static int ceedLinearAssembleAddDiagonal(CeedOperator op);
-  static int ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op);
-  static int ceedCreateFDMElementInverse(CeedOperator op);
-
-  static int ceedApplyAdd(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request);
-
-  static int ceedDestroy(CeedOperator op);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-qfunction-args.cpp b/backends/occa/ceed-occa-qfunction-args.cpp
deleted file mode 100644
index b8d2d9e936..0000000000
--- a/backends/occa/ceed-occa-qfunction-args.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-qfunction-args.hpp"
-
-namespace ceed {
-namespace occa {
-QFunctionArgs::QFunctionArgs() : _isValid(false), _inputCount(0), _outputCount(0) {}
-
-QFunctionArgs::QFunctionArgs(CeedQFunction qf) : _isValid(false), _inputCount(0), _outputCount(0) { setupQFunctionArgs(qf); }
-
-void QFunctionArgs::setupQFunctionArgs(CeedQFunction qf) {
-  CeedQFunctionField *ceedInputFields, *ceedOutputFields;
-
-  CeedCallOccaValid(_isValid, CeedQFunctionGetCeed(qf, &ceed));
-
-  CeedCallOccaValid(_isValid, CeedQFunctionGetNumArgs(qf, &_inputCount, &_outputCount));
-
-  CeedCallOccaValid(_isValid, CeedQFunctionGetFields(qf, NULL, &ceedInputFields, NULL, &ceedOutputFields));
-
-  _isValid = true;
-
-  for (int i = 0; i < _inputCount; ++i) {
-    QFunctionField field = QFunctionField(ceedInputFields[i]);
-    qfInputs.push_back(field);
-    _isValid &= field.isValid();
-  }
-
-  for (int i = 0; i < _outputCount; ++i) {
-    QFunctionField field = QFunctionField(ceedOutputFields[i]);
-    qfOutputs.push_back(field);
-    _isValid &= field.isValid();
-  }
-}
-
-bool QFunctionArgs::isValid() const { return _isValid; }
-
-int QFunctionArgs::inputCount() const { return _inputCount; }
-
-int QFunctionArgs::outputCount() const { return _outputCount; }
-
-const QFunctionField &QFunctionArgs::getQfField(const bool isInput, const int index) const { return isInput ? qfInputs[index] : qfOutputs[index]; }
-
-const QFunctionField &QFunctionArgs::getQfInput(const int index) const { return qfInputs[index]; }
-
-const QFunctionField &QFunctionArgs::getQfOutput(const int index) const { return qfOutputs[index]; }
-
-CeedEvalMode QFunctionArgs::getEvalMode(const bool isInput, const int index) const {
-  return isInput ? qfInputs[index].evalMode : qfOutputs[index].evalMode;
-}
-
-CeedEvalMode QFunctionArgs::getInputEvalMode(const int index) const { return qfInputs[index].evalMode; }
-
-CeedEvalMode QFunctionArgs::getOutputEvalMode(const int index) const { return qfOutputs[index].evalMode; }
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-qfunction-args.hpp b/backends/occa/ceed-occa-qfunction-args.hpp
deleted file mode 100644
index 77093ec93d..0000000000
--- a/backends/occa/ceed-occa-qfunction-args.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_QFUNCTIONARGS_HEADER
-#define CEED_OCCA_QFUNCTIONARGS_HEADER
-
-#include <vector>
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-qfunction-field.hpp"
-
-namespace ceed {
-namespace occa {
-typedef std::vector<QFunctionField> QFunctionFieldVector;
-
-class QFunctionArgs : public CeedObject {
- protected:
-  bool    _isValid;
-  CeedInt _inputCount;
-  CeedInt _outputCount;
-
- public:
-  QFunctionFieldVector qfInputs;
-  QFunctionFieldVector qfOutputs;
-
-  QFunctionArgs();
-  QFunctionArgs(CeedQFunction qf);
-
-  void setupQFunctionArgs(CeedQFunction qf);
-
-  bool isValid() const;
-
-  int inputCount() const;
-  int outputCount() const;
-
-  const QFunctionField &getQfField(const bool isInput, const int index) const;
-
-  const QFunctionField &getQfInput(const int index) const;
-
-  const QFunctionField &getQfOutput(const int index) const;
-
-  CeedEvalMode getEvalMode(const bool isInput, const int index) const;
-
-  CeedEvalMode getInputEvalMode(const int index) const;
-
-  CeedEvalMode getOutputEvalMode(const int index) const;
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-qfunction-field.cpp b/backends/occa/ceed-occa-qfunction-field.cpp
deleted file mode 100644
index 7dada84ba8..0000000000
--- a/backends/occa/ceed-occa-qfunction-field.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-qfunction-field.hpp"
-
-namespace ceed {
-namespace occa {
-QFunctionField::QFunctionField(CeedQFunctionField qfField) : _isValid(false), size(0) {
-  CeedCallOccaValid(_isValid, CeedQFunctionFieldGetEvalMode(qfField, &evalMode));
-
-  CeedCallOccaValid(_isValid, CeedQFunctionFieldGetSize(qfField, &size));
-
-  _isValid = true;
-}
-
-bool QFunctionField::isValid() const { return _isValid; }
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-qfunction-field.hpp b/backends/occa/ceed-occa-qfunction-field.hpp
deleted file mode 100644
index 86eefd690e..0000000000
--- a/backends/occa/ceed-occa-qfunction-field.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_QFUNCTIONFIELD_HEADER
-#define CEED_OCCA_QFUNCTIONFIELD_HEADER
-
-#include "ceed-occa-context.hpp"
-
-namespace ceed {
-namespace occa {
-class QFunctionField {
- protected:
-  bool _isValid;
-
- public:
-  CeedEvalMode evalMode;
-  CeedInt      size;
-
-  QFunctionField(CeedQFunctionField qfField);
-
-  bool isValid() const;
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-qfunction.cpp b/backends/occa/ceed-occa-qfunction.cpp
deleted file mode 100644
index ac8e3b7386..0000000000
--- a/backends/occa/ceed-occa-qfunction.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-qfunction.hpp"
-
-#include <sstream>
-#include <string>
-
-#include "ceed-occa-qfunctioncontext.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-QFunction::QFunction(const std::string &source, const std::string &function_name) : ceedIsIdentity(false) {
-  filename      = source;
-  qFunctionName = function_name;
-}
-
-QFunction *QFunction::getQFunction(CeedQFunction qf, const bool assertValid) {
-  if (!qf) {
-    return NULL;
-  }
-
-  QFunction *qFunction = NULL;
-
-  CeedCallOcca(CeedQFunctionGetData(qf, &qFunction));
-
-  return qFunction;
-}
-
-QFunction *QFunction::from(CeedQFunction qf) {
-  QFunction *qFunction = getQFunction(qf);
-  if (!qFunction) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedQFunctionGetCeed(qf, &qFunction->ceed));
-
-  CeedCallOcca(CeedQFunctionGetInnerContext(qf, &qFunction->qFunctionContext));
-
-  CeedCallOcca(CeedQFunctionIsIdentity(qf, &qFunction->ceedIsIdentity));
-
-  qFunction->args.setupQFunctionArgs(qf);
-  if (!qFunction->args.isValid()) {
-    return NULL;
-  }
-
-  return qFunction;
-}
-
-QFunction *QFunction::from(CeedOperator op) {
-  if (!op) {
-    return NULL;
-  }
-
-  CeedQFunction qf;
-
-  CeedCallOcca(CeedOperatorGetQFunction(op, &qf));
-
-  return QFunction::from(qf);
-}
-
-::occa::properties QFunction::getKernelProps(const CeedInt Q) {
-  ::occa::properties props;
-
-  // Types
-  props["defines/CeedInt"]    = ::occa::dtype::get<CeedInt>().name();
-  props["defines/CeedScalar"] = ::occa::dtype::get<CeedScalar>().name();
-
-  // CEED defines
-  props["defines/CeedPragmaSIMD"]     = "";
-  props["defines/CEED_Q_VLA"]         = "OCCA_Q";
-  props["defines/CEED_ERROR_SUCCESS"] = 0;
-
-  std::stringstream ss;
-  ss << "#define CEED_QFUNCTION(FUNC_NAME) \\" << std::endl
-     << "  inline int FUNC_NAME" << std::endl
-     << "#define CEED_QFUNCTION_HELPER \\" << std::endl
-     << "  inline" << std::endl
-     << std::endl
-     << "#include \"" << filename << "\"" << std::endl;
-
-  props["headers"].asArray() += ss.str();
-
-  return props;
-}
-
-int QFunction::buildKernel(const CeedInt Q) {
-  // TODO: Store a kernel per Q
-  if (!qFunctionKernel.isInitialized()) {
-    ::occa::properties props = getKernelProps(Q);
-
-    // Properties only used in the QFunction kernel source
-    props["defines/OCCA_Q"] = Q;
-
-    const std::string kernelName = "qf_" + qFunctionName;
-
-    qFunctionKernel = (getDevice().buildKernelFromString(getKernelSource(kernelName, Q), kernelName, props));
-  }
-
-  return CEED_ERROR_SUCCESS;
-}
-
-std::string QFunction::getKernelSource(const std::string &kernelName, const CeedInt Q) {
-  std::stringstream ss;
-
-  ss << "@kernel" << std::endl << "void " << kernelName << "(" << std::endl;
-
-  // qfunction arguments
-  for (int i = 0; i < args.inputCount(); ++i) {
-    ss << "  const CeedScalar *in" << i << ',' << std::endl;
-  }
-  for (int i = 0; i < args.outputCount(); ++i) {
-    ss << "  CeedScalar *out" << i << ',' << std::endl;
-  }
-  ss << "  void *ctx" << std::endl;
-  ss << ") {" << std::endl;
-
-  // Iterate over Q and call qfunction
-  ss << "  @tile(128, @outer, @inner)" << std::endl
-     << "  for (int q = 0; q < OCCA_Q; ++q) {" << std::endl
-     << "    const CeedScalar* in[" << std::max(1, args.inputCount()) << "];" << std::endl
-     << "    CeedScalar* out[" << std::max(1, args.outputCount()) << "];" << std::endl;
-
-  // Set and define in for the q point
-  for (int i = 0; i < args.inputCount(); ++i) {
-    const CeedInt     fieldSize = args.getQfInput(i).size;
-    const std::string qIn_i     = "qIn" + std::to_string(i);
-    const std::string in_i      = "in" + std::to_string(i);
-
-    ss << "    CeedScalar " << qIn_i << "[" << fieldSize << "];" << std::endl
-       << "    in[" << i << "] = " << qIn_i << ";"
-       << std::endl
-       // Copy q data
-       << "    for (int qi = 0; qi < " << fieldSize << "; ++qi) {" << std::endl
-       << "      " << qIn_i << "[qi] = " << in_i << "[q + (OCCA_Q * qi)];" << std::endl
-       << "    }" << std::endl;
-  }
-
-  // Set out for the q point
-  for (int i = 0; i < args.outputCount(); ++i) {
-    const CeedInt     fieldSize = args.getQfOutput(i).size;
-    const std::string qOut_i    = "qOut" + std::to_string(i);
-
-    ss << "    CeedScalar " << qOut_i << "[" << fieldSize << "];" << std::endl << "    out[" << i << "] = " << qOut_i << ";" << std::endl;
-  }
-
-  ss << "    " << qFunctionName << "(ctx, 1, in, out);" << std::endl;
-
-  // Copy out for the q point
-  for (int i = 0; i < args.outputCount(); ++i) {
-    const CeedInt     fieldSize = args.getQfOutput(i).size;
-    const std::string qOut_i    = "qOut" + std::to_string(i);
-    const std::string out_i     = "out" + std::to_string(i);
-
-    ss << "    for (int qi = 0; qi < " << fieldSize << "; ++qi) {" << std::endl
-       << "      " << out_i << "[q + (OCCA_Q * qi)] = " << qOut_i << "[qi];" << std::endl
-       << "    }" << std::endl;
-  }
-
-  ss << "  }" << std::endl << "}";
-
-  return ss.str();
-}
-
-int QFunction::apply(CeedInt Q, CeedVector *U, CeedVector *V) {
-  CeedCallBackend(buildKernel(Q));
-
-  std::vector<CeedScalar *> outputArgs;
-
-  qFunctionKernel.clearArgs();
-
-  for (CeedInt i = 0; i < args.inputCount(); i++) {
-    Vector *u = Vector::from(U[i]);
-    if (!u) {
-      return ceedError("Incorrect qFunction input field: U[" + std::to_string(i) + "]");
-    }
-    qFunctionKernel.pushArg(u->getConstKernelArg());
-  }
-
-  for (CeedInt i = 0; i < args.outputCount(); i++) {
-    Vector *v = Vector::from(V[i]);
-    if (!v) {
-      return ceedError("Incorrect qFunction output field: V[" + std::to_string(i) + "]");
-    }
-    qFunctionKernel.pushArg(v->getKernelArg());
-  }
-  if (qFunctionContext) {
-    QFunctionContext *ctx = QFunctionContext::from(qFunctionContext);
-    qFunctionKernel.pushArg(ctx->getKernelArg());
-  } else {
-    qFunctionKernel.pushArg(::occa::null);
-  }
-
-  qFunctionKernel.run();
-
-  return CEED_ERROR_SUCCESS;
-}
-
-//---[ Ceed Callbacks ]-----------
-int QFunction::registerCeedFunction(Ceed ceed, CeedQFunction qf, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "QFunction", qf, fname, f);
-}
-
-int QFunction::ceedCreate(CeedQFunction qf) {
-  Ceed ceed;
-  CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
-  Context *context;
-  CeedCallBackend(CeedGetData(ceed, &context));
-  const char *source;
-  CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source));
-  const char *function_name;
-  CeedCallBackend(CeedQFunctionGetKernelName(qf, &function_name));
-
-  QFunction *qFunction = new QFunction(source, function_name);
-  CeedCallBackend(CeedQFunctionSetData(qf, qFunction));
-
-  CeedOccaRegisterFunction(qf, "Apply", QFunction::ceedApply);
-  CeedOccaRegisterFunction(qf, "Destroy", QFunction::ceedDestroy);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int QFunction::ceedApply(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) {
-  QFunction *qFunction = QFunction::from(qf);
-  if (qFunction) {
-    return qFunction->apply(Q, U, V);
-  }
-
-  return 1;
-}
-
-int QFunction::ceedDestroy(CeedQFunction qf) {
-  delete getQFunction(qf, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-qfunction.hpp b/backends/occa/ceed-occa-qfunction.hpp
deleted file mode 100644
index 4af04c5bd7..0000000000
--- a/backends/occa/ceed-occa-qfunction.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_QFUNCTION_HEADER
-#define CEED_OCCA_QFUNCTION_HEADER
-
-#include "ceed-occa-ceed-object.hpp"
-#include "ceed-occa-qfunction-args.hpp"
-
-namespace ceed {
-namespace occa {
-class QFunction : public CeedObject {
- public:
-  // Ceed object information
-  bool ceedIsIdentity;
-
-  // Owned resources
-  std::string          filename;
-  std::string          qFunctionName;
-  ::occa::kernel       qFunctionKernel;
-  CeedQFunctionContext qFunctionContext;
-  QFunctionArgs        args;
-
-  QFunction(const std::string &source, const std::string &function_name);
-
-  static QFunction *getQFunction(CeedQFunction qf, const bool assertValid = true);
-
-  static QFunction *from(CeedQFunction qf);
-  static QFunction *from(CeedOperator op);
-
-  ::occa::properties getKernelProps(const CeedInt Q);
-
-  int         buildKernel(const CeedInt Q);
-  std::string getKernelSource(const std::string &kernelName, const CeedInt Q);
-
-  int apply(CeedInt Q, CeedVector *U, CeedVector *V);
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedQFunction qf, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedCreate(CeedQFunction qf);
-
-  static int ceedApply(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V);
-
-  static int ceedDestroy(CeedQFunction qf);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-qfunctioncontext.cpp b/backends/occa/ceed-occa-qfunctioncontext.cpp
deleted file mode 100644
index 017925f5a1..0000000000
--- a/backends/occa/ceed-occa-qfunctioncontext.cpp
+++ /dev/null
@@ -1,318 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-qfunctioncontext.hpp"
-
-#include <cstring>
-
-namespace ceed {
-namespace occa {
-QFunctionContext::QFunctionContext() : ctxSize(0), hostBuffer(NULL), currentHostBuffer(NULL), syncState(SyncState::none) {}
-
-QFunctionContext::~QFunctionContext() {
-  memory.free();
-  freeHostCtxBuffer();
-}
-
-QFunctionContext *QFunctionContext::getQFunctionContext(CeedQFunctionContext ctx, const bool assertValid) {
-  if (!ctx) {
-    return NULL;
-  }
-
-  int               ierr;
-  QFunctionContext *ctx_ = NULL;
-
-  ierr = CeedQFunctionContextGetBackendData(ctx, &ctx_);
-  if (assertValid) {
-    CeedOccaFromChk(ierr);
-  }
-
-  return ctx_;
-}
-
-QFunctionContext *QFunctionContext::from(CeedQFunctionContext ctx) {
-  QFunctionContext *ctx_ = getQFunctionContext(ctx);
-  if (!ctx_) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedQFunctionContextGetContextSize(ctx, &ctx_->ctxSize));
-
-  if (ctx_ != NULL) {
-    CeedCallOcca(CeedQFunctionContextGetCeed(ctx, &ctx_->ceed));
-  }
-
-  return ctx_;
-}
-
-void QFunctionContext::resizeCtx(const size_t ctxSize_) { ctxSize = ctxSize_; }
-
-void QFunctionContext::resizeCtxMemory(const size_t ctxSize_) { resizeCtxMemory(getDevice(), ctxSize_); }
-
-void QFunctionContext::resizeCtxMemory(::occa::device device, const size_t ctxSize_) {
-  if (ctxSize_ != memory.size()) {
-    memory.free();
-    memory = device.malloc(ctxSize_);
-  }
-}
-
-void QFunctionContext::resizeHostCtxBuffer(const size_t ctxSize_) {
-  CeedFree(&hostBuffer);
-  CeedMallocArray(1, ctxSize, &hostBuffer);
-}
-
-void QFunctionContext::setCurrentCtxMemoryIfNeeded() {
-  if (!currentMemory.isInitialized()) {
-    resizeCtxMemory(ctxSize);
-    currentMemory = memory;
-  }
-}
-
-void QFunctionContext::setCurrentHostCtxBufferIfNeeded() {
-  if (!currentHostBuffer) {
-    resizeHostCtxBuffer(ctxSize);
-    currentHostBuffer = hostBuffer;
-  }
-}
-
-void QFunctionContext::freeHostCtxBuffer() {
-  if (hostBuffer) {
-    CeedFree(&hostBuffer);
-  }
-}
-
-int QFunctionContext::hasValidData(bool *has_valid_data) const {
-  (*has_valid_data) = (!!hostBuffer) || (!!currentHostBuffer) || (memory.isInitialized()) || (currentMemory.isInitialized());
-  return CEED_ERROR_SUCCESS;
-}
-
-int QFunctionContext::hasBorrowedDataOfType(CeedMemType mem_type, bool *has_borrowed_data_of_type) const {
-  switch (mem_type) {
-    case CEED_MEM_HOST:
-      (*has_borrowed_data_of_type) = !!currentHostBuffer;
-      break;
-    case CEED_MEM_DEVICE:
-      (*has_borrowed_data_of_type) = currentMemory.isInitialized();
-      break;
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int QFunctionContext::setData(CeedMemType mtype, CeedCopyMode cmode, void *data) {
-  switch (cmode) {
-    case CEED_COPY_VALUES:
-      return copyDataValues(mtype, data);
-    case CEED_OWN_POINTER:
-      return ownDataPointer(mtype, data);
-    case CEED_USE_POINTER:
-      return useDataPointer(mtype, data);
-  }
-  return ceedError("Invalid CeedCopyMode passed");
-}
-
-int QFunctionContext::copyDataValues(CeedMemType mtype, void *data) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostCtxBufferIfNeeded();
-      std::memcpy(currentHostBuffer, data, ctxSize);
-      syncState = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentCtxMemoryIfNeeded();
-      currentMemory.copyFrom(dataToMemory(data));
-      syncState = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int QFunctionContext::ownDataPointer(CeedMemType mtype, void *data) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      freeHostCtxBuffer();
-      hostBuffer = currentHostBuffer = data;
-      syncState                      = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      memory.free();
-      memory = currentMemory = dataToMemory(data);
-      syncState              = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int QFunctionContext::useDataPointer(CeedMemType mtype, void *data) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      freeHostCtxBuffer();
-      currentHostBuffer = data;
-      syncState         = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      memory.free();
-      currentMemory = dataToMemory(data);
-      syncState     = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int QFunctionContext::takeData(CeedMemType mtype, void *data) {
-  if (currentHostBuffer == NULL && currentMemory == ::occa::null) return ceedError("No context data set");
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostCtxBufferIfNeeded();
-      if (syncState == SyncState::device) {
-        setCurrentCtxMemoryIfNeeded();
-        currentMemory.copyTo(currentHostBuffer);
-      }
-      syncState         = SyncState::host;
-      *(void **)data    = currentHostBuffer;
-      hostBuffer        = NULL;
-      currentHostBuffer = NULL;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentCtxMemoryIfNeeded();
-      if (syncState == SyncState::host) {
-        setCurrentHostCtxBufferIfNeeded();
-        currentMemory.copyFrom(currentHostBuffer);
-      }
-      syncState      = SyncState::device;
-      *(void **)data = memoryToData(currentMemory);
-      memory         = ::occa::null;
-      currentMemory  = ::occa::null;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int QFunctionContext::getData(CeedMemType mtype, void *data) {
-  // The passed `data` might be modified before restoring
-  if (currentHostBuffer == NULL && currentMemory == ::occa::null) return ceedError("No context data set");
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostCtxBufferIfNeeded();
-      if (syncState == SyncState::device) {
-        setCurrentCtxMemoryIfNeeded();
-        currentMemory.copyTo(currentHostBuffer);
-      }
-      syncState      = SyncState::host;
-      *(void **)data = currentHostBuffer;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentCtxMemoryIfNeeded();
-      if (syncState == SyncState::host) {
-        setCurrentHostCtxBufferIfNeeded();
-        currentMemory.copyFrom(currentHostBuffer);
-      }
-      syncState      = SyncState::device;
-      *(void **)data = memoryToData(currentMemory);
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int QFunctionContext::restoreData() { return CEED_ERROR_SUCCESS; }
-
-::occa::memory QFunctionContext::getKernelArg() {
-  setCurrentCtxMemoryIfNeeded();
-  if (syncState == SyncState::host) {
-    setCurrentHostCtxBufferIfNeeded();
-    currentMemory.copyFrom(currentHostBuffer);
-  }
-  syncState = SyncState::device;
-  return currentMemory;
-}
-
-//---[ Ceed Callbacks ]-----------
-int QFunctionContext::registerCeedFunction(Ceed ceed, CeedQFunctionContext ctx, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "QFunctionContext", ctx, fname, f);
-}
-
-int QFunctionContext::ceedCreate(CeedQFunctionContext ctx) {
-  Ceed ceed;
-  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
-
-  CeedOccaRegisterFunction(ctx, "HasValidData", QFunctionContext::ceedHasValidData);
-  CeedOccaRegisterFunction(ctx, "HasBorrowedDataOfType", QFunctionContext::ceedHasBorrowedDataOfType);
-  CeedOccaRegisterFunction(ctx, "SetData", QFunctionContext::ceedSetData);
-  CeedOccaRegisterFunction(ctx, "TakeData", QFunctionContext::ceedTakeData);
-  CeedOccaRegisterFunction(ctx, "GetData", QFunctionContext::ceedGetData);
-  CeedOccaRegisterFunction(ctx, "GetDataRead", QFunctionContext::ceedGetDataRead);
-  CeedOccaRegisterFunction(ctx, "RestoreData", QFunctionContext::ceedRestoreData);
-  CeedOccaRegisterFunction(ctx, "Destroy", QFunctionContext::ceedDestroy);
-
-  QFunctionContext *ctx_ = new QFunctionContext();
-  CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, ctx_));
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int QFunctionContext::ceedHasValidData(const CeedQFunctionContext ctx, bool *has_valid_data) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->hasValidData(has_valid_data);
-}
-
-int QFunctionContext::ceedHasBorrowedDataOfType(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->hasBorrowedDataOfType(mem_type, has_borrowed_data_of_type);
-}
-
-int QFunctionContext::ceedSetData(CeedQFunctionContext ctx, CeedMemType mtype, CeedCopyMode cmode, void *data) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->setData(mtype, cmode, data);
-}
-
-int QFunctionContext::ceedTakeData(CeedQFunctionContext ctx, CeedMemType mtype, void *data) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->takeData(mtype, data);
-}
-
-int QFunctionContext::ceedGetData(CeedQFunctionContext ctx, CeedMemType mtype, void *data) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->getData(mtype, data);
-}
-
-int QFunctionContext::ceedGetDataRead(CeedQFunctionContext ctx, CeedMemType mtype, void *data) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  // Todo: Determine if calling getData is sufficient
-  return ctx_->getData(mtype, data);
-}
-
-int QFunctionContext::ceedRestoreData(CeedQFunctionContext ctx) {
-  QFunctionContext *ctx_ = QFunctionContext::from(ctx);
-  if (!ctx_) {
-    return staticCeedError("Invalid CeedQFunctionContext passed");
-  }
-  return ctx_->restoreData();
-}
-
-int QFunctionContext::ceedDestroy(CeedQFunctionContext ctx) {
-  delete getQFunctionContext(ctx, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-qfunctioncontext.hpp b/backends/occa/ceed-occa-qfunctioncontext.hpp
deleted file mode 100644
index 850eb3adbf..0000000000
--- a/backends/occa/ceed-occa-qfunctioncontext.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_QFUNCTIONCONTEXT_HEADER
-#define CEED_OCCA_QFUNCTIONCONTEXT_HEADER
-
-#include "ceed-occa-ceed-object.hpp"
-
-namespace ceed {
-namespace occa {
-class QFunctionContext : public CeedObject {
- public:
-  // Owned resources
-  size_t         ctxSize;
-  ::occa::memory memory;
-  void          *hostBuffer;
-
-  // Current resources
-  ::occa::memory currentMemory;
-  void          *currentHostBuffer;
-
-  // State information
-  int syncState;
-
-  QFunctionContext();
-
-  ~QFunctionContext();
-
-  static QFunctionContext *getQFunctionContext(CeedQFunctionContext ctx, const bool assertValid = true);
-
-  static QFunctionContext *from(CeedQFunctionContext ctx);
-
-  ::occa::memory dataToMemory(const void *data) {
-    ::occa::memory mem((::occa::modeMemory_t *)data);
-    return mem;
-  }
-
-  void *memoryToData(::occa::memory &memory) { return memory.getModeMemory(); }
-
-  void resizeCtx(const size_t ctxSize_);
-
-  void resizeCtxMemory(const size_t ctxSize_);
-
-  void resizeCtxMemory(::occa::device device, const size_t ctxSize_);
-
-  void resizeHostCtxBuffer(const size_t ctxSize_);
-
-  void setCurrentCtxMemoryIfNeeded();
-
-  void setCurrentHostCtxBufferIfNeeded();
-
-  void freeHostCtxBuffer();
-
-  int hasValidData(bool *has_valid_data) const;
-
-  int hasBorrowedDataOfType(CeedMemType mem_type, bool *has_borrowed_data_of_type) const;
-
-  int setData(CeedMemType mtype, CeedCopyMode cmode, void *data);
-
-  int copyDataValues(CeedMemType mtype, void *data);
-
-  int ownDataPointer(CeedMemType mtype, void *data);
-
-  int useDataPointer(CeedMemType mtype, void *data);
-
-  int takeData(CeedMemType mtype, void *data);
-
-  int getData(CeedMemType mtype, void *data);
-
-  int restoreData();
-
-  ::occa::memory getKernelArg();
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedQFunctionContext ctx, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedCreate(CeedQFunctionContext ctx);
-
-  static int ceedHasValidData(const CeedQFunctionContext ctx, bool *has_valid_data);
-
-  static int ceedHasBorrowedDataOfType(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type);
-
-  static int ceedSetData(CeedQFunctionContext ctx, CeedMemType mtype, CeedCopyMode cmode, void *data);
-
-  static int ceedTakeData(CeedQFunctionContext ctx, CeedMemType mtype, void *data);
-
-  static int ceedGetData(CeedQFunctionContext ctx, CeedMemType mtype, void *data);
-
-  static int ceedGetDataRead(CeedQFunctionContext ctx, CeedMemType mtype, void *data);
-
-  static int ceedRestoreData(CeedQFunctionContext ctx);
-
-  static int ceedDestroy(CeedQFunctionContext ctx);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-simplex-basis.cpp b/backends/occa/ceed-occa-simplex-basis.cpp
deleted file mode 100644
index 747d21afd9..0000000000
--- a/backends/occa/ceed-occa-simplex-basis.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-simplex-basis.hpp"
-
-#include "ceed-occa-kernels.hpp"
-
-namespace ceed {
-namespace occa {
-SimplexBasis::SimplexBasis(CeedBasis basis, CeedInt dim_, CeedInt P_, CeedInt Q_, const CeedScalar *interp_, const CeedScalar *grad_,
-                           const CeedScalar *qWeight_) {
-  setCeedFields(basis);
-
-  dim = dim_;
-  P   = P_;
-  Q   = Q_;
-
-  ::occa::device device = getDevice();
-
-  interp  = device.malloc<CeedScalar>(P * Q, interp_);
-  grad    = device.malloc<CeedScalar>(P * Q * dim, grad_);
-  qWeight = device.malloc<CeedScalar>(Q, qWeight_);
-
-  setKernelProperties();
-}
-
-SimplexBasis::~SimplexBasis() {}
-
-bool SimplexBasis::isTensorBasis() const { return false; }
-
-const char *SimplexBasis::getFunctionSource() const {
-  // TODO: Add gpu function sources when split
-  return occa_simplex_basis_cpu_function_source;
-}
-
-void SimplexBasis::setKernelProperties() {
-  kernelProperties["defines/CeedInt"]               = ::occa::dtype::get<CeedInt>().name();
-  kernelProperties["defines/CeedScalar"]            = ::occa::dtype::get<CeedScalar>().name();
-  kernelProperties["defines/DIM"]                   = dim;
-  kernelProperties["defines/Q"]                     = Q;
-  kernelProperties["defines/P"]                     = P;
-  kernelProperties["defines/MAX_PQ"]                = P > Q ? P : Q;
-  kernelProperties["defines/BASIS_COMPONENT_COUNT"] = ceedComponentCount;
-  if (usingGpuDevice()) {
-    kernelProperties["defines/ELEMENTS_PER_BLOCK"] = (Q <= 1024) ? (1024 / Q) : 1;
-  }
-}
-
-::occa::kernel SimplexBasis::buildKernel(const std::string &kernelName) {
-  std::string kernelSource;
-  if (usingGpuDevice()) {
-    kernelSource = occa_simplex_basis_gpu_source;
-  } else {
-    kernelSource = occa_simplex_basis_cpu_function_source;
-    kernelSource += '\n';
-    kernelSource += occa_simplex_basis_cpu_kernel_source;
-  }
-
-  return getDevice().buildKernelFromString(kernelSource, kernelName, kernelProperties);
-}
-
-int SimplexBasis::applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) {
-  if (transpose) {
-    if (!interpTKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"] = transpose;
-      interpTKernel                         = buildKernel("interp");
-    }
-
-    interpTKernel(elementCount, interp, U.getConstKernelArg(), V.getKernelArg());
-  } else {
-    if (!interpKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"] = transpose;
-      interpKernel                          = buildKernel("interp");
-    }
-
-    interpKernel(elementCount, interp, U.getConstKernelArg(), V.getKernelArg());
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int SimplexBasis::applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) {
-  if (transpose) {
-    if (!gradTKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"] = transpose;
-      gradTKernel                           = buildKernel("grad");
-    }
-
-    gradTKernel(elementCount, grad, U.getConstKernelArg(), V.getKernelArg());
-  } else {
-    if (!gradKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"] = transpose;
-      gradKernel                            = buildKernel("grad");
-    }
-
-    gradKernel(elementCount, grad, U.getConstKernelArg(), V.getKernelArg());
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int SimplexBasis::applyWeight(const CeedInt elementCount, Vector &W) {
-  if (!weightKernel.isInitialized()) {
-    weightKernel = buildKernel("weight");
-  }
-  weightKernel(elementCount, qWeight, W.getKernelArg());
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int SimplexBasis::apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *U, Vector *V) {
-  const bool transpose = tmode == CEED_TRANSPOSE;
-
-  if ((dim < 1) || (3 < dim)) {
-    return ceedError("Backend only supports dimensions: 1, 2, and 3");
-  }
-
-  // Check arguments
-  if (emode != CEED_EVAL_WEIGHT) {
-    if (!U) {
-      return ceedError("Incorrect CeedVector input: U");
-    }
-  }
-  if (!V) {
-    return ceedError("Incorrect CeedVector input: V");
-  }
-
-  try {
-    // Apply kernel
-    switch (emode) {
-      case CEED_EVAL_INTERP:
-        return applyInterp(elementCount, transpose, *U, *V);
-      case CEED_EVAL_GRAD:
-        return applyGrad(elementCount, transpose, *U, *V);
-      case CEED_EVAL_WEIGHT:
-        return applyWeight(elementCount, *V);
-      default:
-        return ceedError("Backend does not support given simplex eval mode");
-    }
-  } catch (::occa::exception &exc) {
-    // Handle kernel build errors the CEED way
-    CeedHandleOccaException(exc);
-  }
-
-  return CEED_ERROR_SUCCESS;
-}
-
-//---[ Ceed Callbacks ]-------------
-int SimplexBasis::ceedCreate(CeedElemTopology topology, CeedInt dim, CeedInt ndof, CeedInt nquad, const CeedScalar *interp, const CeedScalar *grad,
-                             const CeedScalar *qref, const CeedScalar *qWeight, CeedBasis basis) {
-  Ceed ceed;
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
-
-  SimplexBasis *basis_ = new SimplexBasis(basis, dim, ndof, nquad, interp, grad, qWeight);
-  CeedCallBackend(CeedBasisSetData(basis, basis_));
-
-  CeedOccaRegisterFunction(basis, "Apply", Basis::ceedApply);
-  CeedOccaRegisterFunction(basis, "Destroy", Basis::ceedDestroy);
-
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-simplex-basis.hpp b/backends/occa/ceed-occa-simplex-basis.hpp
deleted file mode 100644
index c27b6d0a88..0000000000
--- a/backends/occa/ceed-occa-simplex-basis.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_SIMPLEXBASIS_HEADER
-#define CEED_OCCA_SIMPLEXBASIS_HEADER
-
-#include "ceed-occa-basis.hpp"
-
-namespace ceed {
-namespace occa {
-class SimplexBasis : public Basis {
- public:
-  ::occa::memory interp;
-  ::occa::memory grad;
-  ::occa::memory qWeight;
-
-  ::occa::json   kernelProperties;
-  ::occa::kernel interpKernel;
-  ::occa::kernel interpTKernel;
-  ::occa::kernel gradKernel;
-  ::occa::kernel gradTKernel;
-  ::occa::kernel weightKernel;
-
-  SimplexBasis(CeedBasis basis, CeedInt dim, CeedInt P_, CeedInt Q_, const CeedScalar *interp_, const CeedScalar *grad_, const CeedScalar *qWeight_);
-
-  ~SimplexBasis();
-
-  bool isTensorBasis() const;
-
-  const char *getFunctionSource() const;
-
-  void setKernelProperties();
-
-  std::string getKernelSource() const;
-
-  ::occa::kernel buildKernel(const std::string &kernelName);
-
-  int applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V);
-
-  int applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V);
-
-  int applyWeight(const CeedInt elementCount, Vector &W);
-
-  int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *u, Vector *v);
-
-  //---[ Ceed Callbacks ]-----------
-  static int ceedCreate(CeedElemTopology topology, CeedInt dim, CeedInt ndof, CeedInt nquad, const CeedScalar *interp, const CeedScalar *grad,
-                        const CeedScalar *qref, const CeedScalar *qWeight, CeedBasis basis);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-tensor-basis.cpp b/backends/occa/ceed-occa-tensor-basis.cpp
deleted file mode 100644
index 553672170c..0000000000
--- a/backends/occa/ceed-occa-tensor-basis.cpp
+++ /dev/null
@@ -1,236 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-tensor-basis.hpp"
-
-#include "ceed-occa-kernels.hpp"
-
-namespace ceed {
-namespace occa {
-TensorBasis::TensorBasis(CeedBasis basis, CeedInt dim_, CeedInt P1D_, CeedInt Q1D_, const CeedScalar *interp1D_, const CeedScalar *grad1D_,
-                         const CeedScalar *qWeight1D_)
-    : P1D(P1D_), Q1D(Q1D_) {
-  setCeedFields(basis);
-
-  dim = dim_;
-
-  P = P1D;
-  Q = Q1D;
-  for (int i = 1; i < dim; ++i) {
-    P *= P1D;
-    Q *= Q1D;
-  }
-
-  ::occa::device device = getDevice();
-
-  interp1D  = device.malloc<CeedScalar>(P1D * Q1D, interp1D_);
-  grad1D    = device.malloc<CeedScalar>(P1D * Q1D, grad1D_);
-  qWeight1D = device.malloc<CeedScalar>(Q1D, qWeight1D_);
-
-  setKernelProperties();
-}
-
-TensorBasis::~TensorBasis() {}
-
-bool TensorBasis::isTensorBasis() const { return true; }
-
-void TensorBasis::setKernelProperties() {
-  kernelProperties["defines/CeedInt"]               = ::occa::dtype::get<CeedInt>().name();
-  kernelProperties["defines/CeedScalar"]            = ::occa::dtype::get<CeedScalar>().name();
-  kernelProperties["defines/Q1D"]                   = Q1D;
-  kernelProperties["defines/P1D"]                   = P1D;
-  kernelProperties["defines/BASIS_COMPONENT_COUNT"] = ceedComponentCount;
-  if (usingGpuDevice()) {
-    kernelProperties["defines/MAX_PQ"] = (Q1D > P1D) ? Q1D : P1D;
-  }
-}
-
-const char *TensorBasis::getFunctionSource() const {
-  // TODO: Add gpu function sources when split
-  const char *cpuFunctionSources[3] = {occa_tensor_basis_1d_cpu_function_source, occa_tensor_basis_2d_cpu_function_source,
-                                       occa_tensor_basis_3d_cpu_function_source};
-  return cpuFunctionSources[dim - 1];
-}
-
-std::string TensorBasis::getKernelSource() const {
-  const char *cpuFunctionSources[3] = {occa_tensor_basis_1d_cpu_function_source, occa_tensor_basis_2d_cpu_function_source,
-                                       occa_tensor_basis_3d_cpu_function_source};
-  const char *cpuKernelSources[3]   = {occa_tensor_basis_1d_cpu_kernel_source, occa_tensor_basis_2d_cpu_kernel_source,
-                                       occa_tensor_basis_3d_cpu_kernel_source};
-  const char *gpuKernelSources[3]   = {occa_tensor_basis_1d_gpu_source, occa_tensor_basis_2d_gpu_source, occa_tensor_basis_3d_gpu_source};
-
-  std::string kernelSource;
-  if (usingGpuDevice()) {
-    kernelSource = gpuKernelSources[dim - 1];
-  } else {
-    kernelSource = cpuFunctionSources[dim - 1];
-    kernelSource += '\n';
-    kernelSource += cpuKernelSources[dim - 1];
-  }
-  return kernelSource;
-}
-
-::occa::kernel TensorBasis::buildKernel(const std::string &kernelName) {
-  std::string kernelSource = getKernelSource();
-  return getDevice().buildKernelFromString(kernelSource, kernelName, kernelProperties);
-}
-
-int TensorBasis::applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) {
-  if (transpose) {
-    if (!interpTKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"]          = transpose;
-      kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockInterp();
-      interpTKernel                                  = buildKernel("interp");
-    }
-    interpTKernel(elementCount, interp1D, U.getConstKernelArg(), V.getKernelArg());
-  } else {
-    if (!interpKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"]          = transpose;
-      kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockInterp();
-      interpKernel                                   = buildKernel("interp");
-    }
-    interpKernel(elementCount, interp1D, U.getConstKernelArg(), V.getKernelArg());
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int TensorBasis::elementsPerBlockInterp() const {
-  int elementsPerBlock;
-  if (dim == 1) {
-    elementsPerBlock = 32;
-  } else if (dim == 2) {
-    const CeedInt blocksByQ[7] = {0, 32, 8, 6, 4, 2, 8};
-    if (Q1D < 7) {
-      elementsPerBlock = blocksByQ[Q1D];
-    } else {
-      elementsPerBlock = 1;
-    }
-  } else {
-    elementsPerBlock = 1;
-  }
-  return elementsPerBlock;
-}
-
-int TensorBasis::applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) {
-  if (transpose) {
-    if (!gradTKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"]          = transpose;
-      kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockGrad();
-      gradTKernel                                    = buildKernel("grad");
-    }
-    gradTKernel(elementCount, interp1D, grad1D, U.getConstKernelArg(), V.getKernelArg());
-  } else {
-    if (!gradKernel.isInitialized()) {
-      kernelProperties["defines/TRANSPOSE"]          = transpose;
-      kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockGrad();
-      gradKernel                                     = buildKernel("grad");
-    }
-    gradKernel(elementCount, interp1D, grad1D, U.getConstKernelArg(), V.getKernelArg());
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int TensorBasis::elementsPerBlockGrad() const {
-  int elementsPerBlock;
-  if (dim == 1) {
-    elementsPerBlock = 32;
-  } else if (dim == 2) {
-    const CeedInt blocksByQ[7] = {0, 32, 8, 6, 4, 2, 8};
-    if (Q1D < 7) {
-      elementsPerBlock = blocksByQ[Q1D];
-    } else {
-      elementsPerBlock = 1;
-    }
-  } else {
-    elementsPerBlock = 1;
-  }
-  return elementsPerBlock;
-}
-
-int TensorBasis::applyWeight(const CeedInt elementCount, Vector &W) {
-  if (!weightKernel.isInitialized()) {
-    kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockWeight();
-    weightKernel                                   = buildKernel("weight");
-  }
-  weightKernel(elementCount, qWeight1D, W.getKernelArg());
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int TensorBasis::elementsPerBlockWeight() const {
-  int elementsPerBlock;
-  if (dim == 1) {
-    elementsPerBlock = 32 / Q1D;
-  } else if (dim == 2) {
-    if ((Q1D * Q1D) > 32) {
-      elementsPerBlock = 1;
-    } else {
-      elementsPerBlock = 32 / (Q1D * Q1D);
-    }
-  } else {
-    elementsPerBlock = Q1D;
-  }
-  return elementsPerBlock;
-}
-
-int TensorBasis::apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *U, Vector *V) {
-  const bool transpose = tmode == CEED_TRANSPOSE;
-
-  if ((dim < 1) || (3 < dim)) {
-    return ceedError("Backend only supports dimensions: 1, 2, and 3");
-  }
-
-  // Check arguments
-  if (emode != CEED_EVAL_WEIGHT) {
-    if (!U) {
-      return ceedError("Incorrect CeedVector input: U");
-    }
-  }
-  if (!V) {
-    return ceedError("Incorrect CeedVector input: V");
-  }
-
-  try {
-    // Apply kernel
-    switch (emode) {
-      case CEED_EVAL_INTERP:
-        return applyInterp(elementCount, transpose, *U, *V);
-      case CEED_EVAL_GRAD:
-        return applyGrad(elementCount, transpose, *U, *V);
-      case CEED_EVAL_WEIGHT:
-        return applyWeight(elementCount, *V);
-      default:
-        return ceedError("Backend does not support given tensor eval mode");
-    }
-  } catch (::occa::exception &exc) {
-    // Handle kernel build errors the CEED way
-    CeedHandleOccaException(exc);
-  }
-
-  return CEED_ERROR_SUCCESS;
-}
-
-//---[ Ceed Callbacks ]-------------
-int TensorBasis::ceedCreate(CeedInt dim, CeedInt P1D, CeedInt Q1D, const CeedScalar *interp1D, const CeedScalar *grad1D, const CeedScalar *qref1D,
-                            const CeedScalar *qWeight1D, CeedBasis basis) {
-  Ceed ceed;
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
-
-  if (Q1D < P1D && Context::from(ceed)->usingGpuDevice()) {
-    return staticCeedError("(OCCA) Backend does not implement underintegrated basis");
-  }
-
-  TensorBasis *basis_ = new TensorBasis(basis, dim, P1D, Q1D, interp1D, grad1D, qWeight1D);
-  CeedCallBackend(CeedBasisSetData(basis, basis_));
-
-  CeedOccaRegisterFunction(basis, "Apply", Basis::ceedApply);
-  CeedOccaRegisterFunction(basis, "Destroy", Basis::ceedDestroy);
-
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-tensor-basis.hpp b/backends/occa/ceed-occa-tensor-basis.hpp
deleted file mode 100644
index 35e345b8c9..0000000000
--- a/backends/occa/ceed-occa-tensor-basis.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_TENSORBASIS_HEADER
-#define CEED_OCCA_TENSORBASIS_HEADER
-
-#include "ceed-occa-basis.hpp"
-
-namespace ceed {
-namespace occa {
-class TensorBasis : public Basis {
- public:
-  CeedInt        P1D;
-  CeedInt        Q1D;
-  ::occa::memory interp1D;
-  ::occa::memory grad1D;
-  ::occa::memory qWeight1D;
-
-  ::occa::json   kernelProperties;
-  ::occa::kernel interpKernel;
-  ::occa::kernel interpTKernel;
-  ::occa::kernel gradKernel;
-  ::occa::kernel gradTKernel;
-  ::occa::kernel weightKernel;
-
-  TensorBasis(CeedBasis basis, CeedInt dim_, CeedInt P1D_, CeedInt Q1D_, const CeedScalar *interp1D_, const CeedScalar *grad1D_,
-              const CeedScalar *qWeight1D_);
-
-  ~TensorBasis();
-
-  bool isTensorBasis() const;
-
-  const char *getFunctionSource() const;
-
-  std::string getKernelSource() const;
-
-  void setKernelProperties();
-
-  int elementsPerBlockInterp() const;
-  int elementsPerBlockGrad() const;
-  int elementsPerBlockWeight() const;
-
-  ::occa::kernel buildKernel(const std::string &kernelName);
-
-  int applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V);
-
-  int applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V);
-
-  int applyWeight(const CeedInt elementCount, Vector &W);
-
-  int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *U, Vector *V);
-
-  //---[ Ceed Callbacks ]-----------
-  static int ceedCreate(CeedInt dim, CeedInt P1D, CeedInt Q1D, const CeedScalar *interp1D, const CeedScalar *grad1D, const CeedScalar *qref1D,
-                        const CeedScalar *qWeight1D, CeedBasis basis);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-types.hpp b/backends/occa/ceed-occa-types.hpp
deleted file mode 100644
index cc56791f85..0000000000
--- a/backends/occa/ceed-occa-types.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_TYPES_HEADER
-#define CEED_OCCA_TYPES_HEADER
-
-#include <ceed/backend.h>
-
-#include <occa.hpp>
-
-#define CeedOccaFromChk(ierr) \
-  do {                        \
-    if (ierr) {               \
-      return NULL;            \
-    }                         \
-  } while (0)
-
-#define CeedCallOcca(...)      \
-  do {                         \
-    int ierr_q_ = __VA_ARGS__; \
-    CeedOccaFromChk(ierr_q_);  \
-  } while (0);
-
-#define CeedOccaValidChk(isValidVar, ierr) \
-  do {                                     \
-    if (ierr) {                            \
-      isValidVar = false;                  \
-      return;                              \
-    }                                      \
-  } while (0)
-
-#define CeedCallOccaValid(isValidVar, ...) \
-  do {                                     \
-    int ierr_q_ = __VA_ARGS__;             \
-    CeedOccaValidChk(isValidVar, ierr_q_); \
-  } while (0);
-
-#define CeedHandleOccaException(exc)                           \
-  do {                                                         \
-    std::string error = exc.toString();                        \
-    return CeedError(ceed, CEED_ERROR_BACKEND, error.c_str()); \
-  } while (0)
-
-#define CeedOccaCastRegisterFunction(func) (ceed::occa::ceedFunction)(void *) func
-
-#define CeedOccaRegisterBaseFunction(name, func) CeedCallBackend(registerCeedFunction(ceed, name, CeedOccaCastRegisterFunction(func)));
-
-#define CeedOccaRegisterFunction(object, name, func) CeedCallBackend(registerCeedFunction(ceed, object, name, CeedOccaCastRegisterFunction(func)));
-
-namespace ceed {
-namespace occa {
-typedef int (*ceedFunction)();
-}
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa-vector.cpp b/backends/occa/ceed-occa-vector.cpp
deleted file mode 100644
index 0a5c51a28a..0000000000
--- a/backends/occa/ceed-occa-vector.cpp
+++ /dev/null
@@ -1,460 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "ceed-occa-vector.hpp"
-
-#include <cstring>
-
-#include "ceed-occa-kernels.hpp"
-
-namespace ceed {
-namespace occa {
-Vector::Vector() : length(0), hostBufferLength(0), hostBuffer(NULL), currentHostBuffer(NULL), syncState(SyncState::none) {}
-
-Vector::~Vector() {
-  memory.free();
-  freeHostBuffer();
-}
-
-int Vector::hasValidArray(bool *has_valid_array) {
-  (*has_valid_array) = (!!hostBuffer) || (!!currentHostBuffer) || (memory.isInitialized()) || (currentMemory.isInitialized());
-  return CEED_ERROR_SUCCESS;
-}
-
-int Vector::hasBorrowedArrayOfType(CeedMemType mem_type, bool *has_borrowed_array_of_type) {
-  switch (mem_type) {
-    case CEED_MEM_HOST:
-      (*has_borrowed_array_of_type) = !!currentHostBuffer;
-      break;
-    case CEED_MEM_DEVICE:
-      (*has_borrowed_array_of_type) = currentMemory.isInitialized();
-      break;
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-Vector *Vector::getVector(CeedVector vec, const bool assertValid) {
-  if (!vec || vec == CEED_VECTOR_NONE) {
-    return NULL;
-  }
-
-  int     ierr;
-  Vector *vector = NULL;
-
-  ierr = CeedVectorGetData(vec, &vector);
-  if (assertValid) {
-    CeedOccaFromChk(ierr);
-  }
-
-  return vector;
-}
-
-Vector *Vector::from(CeedVector vec) {
-  Vector *vector = getVector(vec);
-  if (!vector) {
-    return NULL;
-  }
-
-  CeedCallOcca(CeedVectorGetCeed(vec, &vector->ceed));
-  CeedCallOcca(CeedVectorGetLength(vec, &vector->length));
-
-  return vector;
-}
-
-void Vector::resize(const CeedSize length_) { length = length_; }
-
-void Vector::resizeMemory(const CeedSize length_) { resizeMemory(getDevice(), length_); }
-
-void Vector::resizeMemory(::occa::device device, const CeedSize length_) {
-  if (length_ != (CeedSize)memory.length()) {
-    memory.free();
-    memory = device.malloc<CeedScalar>(length_);
-  }
-}
-
-void Vector::resizeHostBuffer(const CeedSize length_) {
-  if (length_ != hostBufferLength) {
-    delete hostBuffer;
-    hostBuffer = new CeedScalar[length_];
-  }
-}
-
-void Vector::setCurrentMemoryIfNeeded() {
-  if (!currentMemory.isInitialized()) {
-    resizeMemory(length);
-    currentMemory = memory;
-  }
-}
-
-void Vector::setCurrentHostBufferIfNeeded() {
-  if (!currentHostBuffer) {
-    resizeHostBuffer(length);
-    currentHostBuffer = hostBuffer;
-  }
-}
-
-void Vector::freeHostBuffer() {
-  if (hostBuffer) {
-    delete[] hostBuffer;
-    hostBuffer = NULL;
-  }
-}
-
-int Vector::setValue(CeedScalar value) {
-  // Prioritize keeping data in the device
-  if (syncState & SyncState::device) {
-    setCurrentMemoryIfNeeded();
-    if (!setValueKernel.isInitialized()) {
-      ::occa::json kernelProperties;
-      CeedInt constexpr block_size{256};
-      kernelProperties["defines/CeedInt"]    = ::occa::dtype::get<CeedInt>().name();
-      kernelProperties["defines/CeedScalar"] = ::occa::dtype::get<CeedScalar>().name();
-      kernelProperties["defines/BLOCK_SIZE"] = block_size;
-
-      std::string kernelSource = occa_set_value_source;
-      setValueKernel           = getDevice().buildKernelFromString(kernelSource, "setValue", kernelProperties);
-      setValueKernel(currentMemory, value, length);
-    }
-    syncState = SyncState::device;
-  } else {
-    setCurrentHostBufferIfNeeded();
-    for (CeedInt i = 0; i < length; ++i) {
-      currentHostBuffer[i] = value;
-    }
-    syncState = SyncState::host;
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-int Vector::setArray(CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array) {
-  switch (cmode) {
-    case CEED_COPY_VALUES:
-      return copyArrayValues(mtype, array);
-    case CEED_OWN_POINTER:
-      return ownArrayPointer(mtype, array);
-    case CEED_USE_POINTER:
-      return useArrayPointer(mtype, array);
-  }
-  return ceedError("Invalid CeedCopyMode passed");
-}
-
-int Vector::takeArray(CeedMemType mtype, CeedScalar **array) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostBufferIfNeeded();
-      if (syncState == SyncState::device) {
-        setCurrentMemoryIfNeeded();
-        currentMemory.copyTo(currentHostBuffer);
-      }
-      *array            = currentHostBuffer;
-      hostBuffer        = NULL;
-      currentHostBuffer = NULL;
-
-      syncState = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentMemoryIfNeeded();
-      if (syncState == SyncState::host) {
-        setCurrentHostBufferIfNeeded();
-        currentMemory.copyFrom(currentHostBuffer);
-      }
-      *array        = memoryToArray<CeedScalar>(currentMemory);
-      memory        = ::occa::null;
-      currentMemory = ::occa::null;
-
-      syncState = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int Vector::copyArrayValues(CeedMemType mtype, CeedScalar *array) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostBufferIfNeeded();
-      if (array) {
-        std::memcpy(currentHostBuffer, array, length * sizeof(CeedScalar));
-      }
-      syncState = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentMemoryIfNeeded();
-      if (array) {
-        currentMemory.copyFrom(arrayToMemory(array));
-      }
-      syncState = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int Vector::ownArrayPointer(CeedMemType mtype, CeedScalar *array) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      freeHostBuffer();
-      hostBuffer = currentHostBuffer = array;
-      syncState                      = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      memory.free();
-      memory = currentMemory = arrayToMemory(array);
-      syncState              = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int Vector::useArrayPointer(CeedMemType mtype, CeedScalar *array) {
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      freeHostBuffer();
-      currentHostBuffer = array;
-      syncState         = SyncState::host;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      memory.free();
-      currentMemory = arrayToMemory(array);
-      syncState     = SyncState::device;
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int Vector::getArray(CeedMemType mtype, CeedScalar **array) {
-  // The passed `array` might be modified before restoring
-  // so we can't set sync state to SyncState::all
-  switch (mtype) {
-    case CEED_MEM_HOST:
-      setCurrentHostBufferIfNeeded();
-      if (syncState == SyncState::device) {
-        setCurrentMemoryIfNeeded();
-        currentMemory.copyTo(currentHostBuffer);
-      }
-      syncState = SyncState::host;
-      *array    = currentHostBuffer;
-      return CEED_ERROR_SUCCESS;
-    case CEED_MEM_DEVICE:
-      setCurrentMemoryIfNeeded();
-      if (syncState == SyncState::host) {
-        setCurrentHostBufferIfNeeded();
-        currentMemory.copyFrom(currentHostBuffer);
-      }
-      syncState = SyncState::device;
-      *array    = memoryToArray<CeedScalar>(currentMemory);
-      return CEED_ERROR_SUCCESS;
-  }
-  return ceedError("Invalid CeedMemType passed");
-}
-
-int Vector::getReadOnlyArray(CeedMemType mtype, CeedScalar **array) {
-  const bool willBeFullySynced =
-      ((syncState == SyncState::device && mtype == CEED_MEM_HOST) || (syncState == SyncState::host && mtype == CEED_MEM_DEVICE));
-
-  const int error = getArray(mtype, const_cast<CeedScalar **>(array));
-  // Take advantage the vector will be fully synced
-  if (!error && willBeFullySynced) {
-    syncState = SyncState::all;
-  }
-
-  return error;
-}
-
-int Vector::getWriteOnlyArray(CeedMemType mtype, CeedScalar **array) {
-  // const bool willBeFullySynced = (
-  //   (syncState == SyncState::device && mtype == CEED_MEM_HOST) ||
-  //   (syncState == SyncState::host && mtype == CEED_MEM_DEVICE)
-  // );
-
-  const int error = getArray(mtype, const_cast<CeedScalar **>(array));
-  // // Take advantage the vector will be fully synced
-  // if (!error && willBeFullySynced) {
-  //   syncState = SyncState::all;
-  // }
-
-  return error;
-}
-
-int Vector::restoreArray(CeedScalar **array) { return CEED_ERROR_SUCCESS; }
-
-int Vector::restoreReadOnlyArray(CeedScalar **array) { return CEED_ERROR_SUCCESS; }
-
-::occa::memory Vector::getKernelArg() {
-  setCurrentMemoryIfNeeded();
-  if (syncState == SyncState::host) {
-    setCurrentHostBufferIfNeeded();
-    currentMemory.copyFrom(currentHostBuffer);
-  }
-  syncState = SyncState::device;
-  return currentMemory;
-}
-
-::occa::memory Vector::getConstKernelArg() {
-  setCurrentMemoryIfNeeded();
-  if (syncState == SyncState::host) {
-    setCurrentHostBufferIfNeeded();
-    currentMemory.copyFrom(currentHostBuffer);
-    syncState = SyncState::all;
-  }
-  return currentMemory;
-}
-
-void Vector::printValues(const std::string &name) {
-  CeedScalar *values;
-  getReadOnlyArray(CEED_MEM_HOST, &values);
-
-  std::cout << std::setprecision(8) << "Vector: " << name << std::endl << "  - Values: " << std::endl;
-
-  for (int i = 0; i < length; ++i) {
-    printf("    %12.8f\n", values[i]);
-  }
-}
-
-void Vector::printNonZeroValues(const std::string &name) {
-  CeedScalar *values;
-  getReadOnlyArray(CEED_MEM_HOST, &values);
-
-  std::cout << std::setprecision(8) << "Vector: " << name << std::endl << "  - Non-zero values: " << std::endl;
-
-  for (int i = 0; i < length; ++i) {
-    if (fabs(values[i]) > 1e-8) {
-      printf("    %d: %12.8f\n", i, values[i]);
-    }
-  }
-}
-
-void Vector::printSummary(const std::string &name) {
-  CeedScalar *values;
-  getReadOnlyArray(CEED_MEM_HOST, &values);
-
-  CeedScalar minValue = values[0];
-  CeedScalar maxValue = values[0];
-
-  for (int i = 0; i < length; ++i) {
-    const CeedScalar value = values[i];
-    minValue               = minValue < value ? minValue : value;
-    maxValue               = maxValue > value ? maxValue : value;
-  }
-
-  std::cout << std::setprecision(8) << "Vector: " << name << std::endl
-            << "  - Length: " << length << std::endl
-            << "  - Min   : " << minValue << std::endl
-            << "  - Max   : " << maxValue << std::endl;
-}
-
-//---[ Ceed Callbacks ]-----------
-int Vector::registerCeedFunction(Ceed ceed, CeedVector vec, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "Vector", vec, fname, f);
-}
-
-int Vector::ceedCreate(CeedSize length, CeedVector vec) {
-  Ceed ceed;
-  CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
-
-  CeedOccaRegisterFunction(vec, "HasValidArray", Vector::ceedHasValidArray);
-  CeedOccaRegisterFunction(vec, "HasBorrowedArrayOfType", Vector::ceedHasBorrowedArrayOfType);
-  CeedOccaRegisterFunction(vec, "SetValue", Vector::ceedSetValue);
-  CeedOccaRegisterFunction(vec, "SetArray", Vector::ceedSetArray);
-  CeedOccaRegisterFunction(vec, "TakeArray", Vector::ceedTakeArray);
-  CeedOccaRegisterFunction(vec, "GetArray", Vector::ceedGetArray);
-  CeedOccaRegisterFunction(vec, "GetArrayRead", Vector::ceedGetArrayRead);
-  CeedOccaRegisterFunction(vec, "GetArrayWrite", Vector::ceedGetArrayWrite);
-  CeedOccaRegisterFunction(vec, "RestoreArray", Vector::ceedRestoreArray);
-  CeedOccaRegisterFunction(vec, "RestoreArrayRead", Vector::ceedRestoreArrayRead);
-  CeedOccaRegisterFunction(vec, "Destroy", Vector::ceedDestroy);
-
-  Vector *vector = new Vector();
-  CeedCallBackend(CeedVectorSetData(vec, vector));
-
-  return CEED_ERROR_SUCCESS;
-}
-
-int Vector::ceedHasValidArray(CeedVector vec, bool *has_valid_array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->hasValidArray(has_valid_array);
-}
-
-int Vector::ceedHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->hasBorrowedArrayOfType(mem_type, has_borrowed_array_of_type);
-}
-
-int Vector::ceedSetValue(CeedVector vec, CeedScalar value) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->setValue(value);
-}
-
-int Vector::ceedSetArray(CeedVector vec, CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->setArray(mtype, cmode, array);
-}
-
-int Vector::ceedTakeArray(CeedVector vec, CeedMemType mtype, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->takeArray(mtype, array);
-}
-
-int Vector::ceedGetArray(CeedVector vec, CeedMemType mtype, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->getArray(mtype, array);
-}
-
-int Vector::ceedGetArrayRead(CeedVector vec, CeedMemType mtype, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->getReadOnlyArray(mtype, array);
-}
-
-int Vector::ceedGetArrayWrite(CeedVector vec, CeedMemType mtype, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->getWriteOnlyArray(mtype, array);
-}
-
-int Vector::ceedRestoreArray(CeedVector vec, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->restoreArray(array);
-}
-
-int Vector::ceedRestoreArrayRead(CeedVector vec, CeedScalar **array) {
-  Vector *vector = Vector::from(vec);
-  if (!vector) {
-    return staticCeedError("Invalid CeedVector passed");
-  }
-  return vector->restoreReadOnlyArray(array);
-}
-
-int Vector::ceedDestroy(CeedVector vec) {
-  delete getVector(vec, false);
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
diff --git a/backends/occa/ceed-occa-vector.hpp b/backends/occa/ceed-occa-vector.hpp
deleted file mode 100644
index 37abf5d7fa..0000000000
--- a/backends/occa/ceed-occa-vector.hpp
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_VECTOR_HEADER
-#define CEED_OCCA_VECTOR_HEADER
-
-#include "ceed-occa-ceed-object.hpp"
-
-namespace ceed {
-namespace occa {
-template <class TM>
-::occa::memory arrayToMemory(const TM *array) {
-  if (array) {
-    ::occa::memory mem((::occa::modeMemory_t *)array);
-    mem.setDtype(::occa::dtype::get<TM>());
-    return mem;
-  }
-  return ::occa::null;
-}
-
-template <class TM>
-TM *memoryToArray(::occa::memory &memory) {
-  return (TM *)memory.getModeMemory();
-}
-
-class Vector : public CeedObject {
- public:
-  // Owned resources
-  CeedSize       length;
-  ::occa::memory memory;
-  CeedSize       hostBufferLength;
-  CeedScalar    *hostBuffer;
-
-  ::occa::kernel setValueKernel;
-
-  // Current resources
-  ::occa::memory currentMemory;
-  CeedScalar    *currentHostBuffer;
-
-  // State information
-  int syncState;
-
-  Vector();
-
-  ~Vector();
-
-  int hasValidArray(bool *has_valid_array);
-
-  int hasBorrowedArrayOfType(CeedMemType mem_type, bool *has_borrowed_array_of_type);
-
-  static Vector *getVector(CeedVector vec, const bool assertValid = true);
-
-  static Vector *from(CeedVector vec);
-
-  void resize(const CeedSize length_);
-
-  void resizeMemory(const CeedSize length_);
-
-  void resizeMemory(::occa::device device, const CeedSize length_);
-
-  void resizeHostBuffer(const CeedSize length_);
-
-  void setCurrentMemoryIfNeeded();
-
-  void setCurrentHostBufferIfNeeded();
-
-  void freeHostBuffer();
-
-  int setValue(CeedScalar value);
-
-  int setArray(CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array);
-
-  int takeArray(CeedMemType mtype, CeedScalar **array);
-
-  int copyArrayValues(CeedMemType mtype, CeedScalar *array);
-
-  int ownArrayPointer(CeedMemType mtype, CeedScalar *array);
-
-  int useArrayPointer(CeedMemType mtype, CeedScalar *array);
-
-  int getArray(CeedMemType mtype, CeedScalar **array);
-
-  int getReadOnlyArray(CeedMemType mtype, CeedScalar **array);
-
-  int getWriteOnlyArray(CeedMemType mtype, CeedScalar **array);
-
-  int restoreArray(CeedScalar **array);
-
-  int restoreReadOnlyArray(CeedScalar **array);
-
-  ::occa::memory getKernelArg();
-
-  ::occa::memory getConstKernelArg();
-
-  void printValues(const std::string &name);
-  void printNonZeroValues(const std::string &name);
-  void printSummary(const std::string &name);
-
-  //---[ Ceed Callbacks ]-----------
-  static int registerCeedFunction(Ceed ceed, CeedVector vec, const char *fname, ceed::occa::ceedFunction f);
-
-  static int ceedHasValidArray(CeedVector vec, bool *has_valid_array);
-
-  static int ceedHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type);
-
-  static int ceedCreate(CeedSize length, CeedVector vec);
-
-  static int ceedSetValue(CeedVector vec, CeedScalar value);
-
-  static int ceedSetArray(CeedVector vec, CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array);
-
-  static int ceedTakeArray(CeedVector vec, CeedMemType mtype, CeedScalar **array);
-
-  static int ceedGetArray(CeedVector vec, CeedMemType mtype, CeedScalar **array);
-
-  static int ceedGetArrayRead(CeedVector vec, CeedMemType mtype, CeedScalar **array);
-
-  static int ceedGetArrayWrite(CeedVector vec, CeedMemType mtype, CeedScalar **array);
-
-  static int ceedRestoreArray(CeedVector vec, CeedScalar **array);
-
-  static int ceedRestoreArrayRead(CeedVector vec, CeedScalar **array);
-
-  static int ceedDestroy(CeedVector vec);
-};
-}  // namespace occa
-}  // namespace ceed
-
-#endif
diff --git a/backends/occa/ceed-occa.cpp b/backends/occa/ceed-occa.cpp
deleted file mode 100644
index d43231f2a1..0000000000
--- a/backends/occa/ceed-occa.cpp
+++ /dev/null
@@ -1,329 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#warning "libCEED OCCA backend is experimental; for best performance, use device native backends"
-
-#include <map>
-#include <occa.hpp>
-#include <vector>
-
-#include "ceed-occa-context.hpp"
-#include "ceed-occa-elem-restriction.hpp"
-#include "ceed-occa-operator.hpp"
-#include "ceed-occa-qfunction.hpp"
-#include "ceed-occa-qfunctioncontext.hpp"
-#include "ceed-occa-simplex-basis.hpp"
-#include "ceed-occa-tensor-basis.hpp"
-#include "ceed-occa-types.hpp"
-#include "ceed-occa-vector.hpp"
-
-namespace ceed {
-namespace occa {
-typedef std::map<std::string, std::string> StringMap;
-typedef std::vector<std::string>           StringVector;
-
-enum ResourceParserStep { RESOURCE, QUERY_KEY, QUERY_VALUE };
-
-static const char RESOURCE_DELIMITER        = '/';
-static const char QUERY_DELIMITER           = ':';
-static const char QUERY_KEY_VALUE_DELIMITER = '=';
-static const char QUERY_ARG_DELIMITER       = ',';
-
-static std::string getDefaultDeviceMode(const bool cpuMode, const bool gpuMode) {
-  // In case both cpuMode and gpuMode are set, prioritize the GPU if available
-  // For example, if the resource is "/*/occa"
-  if (gpuMode) {
-    if (::occa::modeIsEnabled("CUDA")) {
-      return "CUDA";
-    }
-    if (::occa::modeIsEnabled("HIP")) {
-      return "HIP";
-    }
-    if (::occa::modeIsEnabled("dpcpp")) {
-      return "dpcpp";
-    }
-    if (::occa::modeIsEnabled("OpenCL")) {
-      return "OpenCL";
-    }
-    // Metal doesn't support doubles
-  }
-
-  if (cpuMode) {
-    if (::occa::modeIsEnabled("OpenMP")) {
-      return "OpenMP";
-    }
-    return "Serial";
-  }
-
-  return "";
-}
-
-static int getDeviceMode(const std::string &match, std::string &mode) {
-  if (match == "cuda") {
-    mode = "CUDA";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (match == "hip") {
-    mode = "HIP";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (match == "dpcpp") {
-    mode = "dpcpp";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (match == "opencl") {
-    mode = "OpenCL";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (match == "openmp") {
-    mode = "OpenMP";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (match == "serial") {
-    mode = "Serial";
-    return CEED_ERROR_SUCCESS;
-  }
-
-  const bool autoMode = match == "*";
-  const bool cpuMode  = match == "cpu";
-  const bool gpuMode  = match == "gpu";
-
-  mode = getDefaultDeviceMode(cpuMode || autoMode, gpuMode || autoMode);
-  return !mode.size();
-}
-
-static int splitCeedResource(const std::string &resource, std::string &match, StringMap &query) {
-  /*
-   * resource:
-   *
-   *    "/gpu/occa?mode='CUDA':device_id=0"
-   *
-   * resourceVector:
-   *
-   *    ["gpu", "occa"]
-   *
-   * match:
-   *
-   *    "gpu"
-   *
-   * query:
-   *
-   *    {
-   *      "mode": "'CUDA'",
-   *      "device_id": "0",
-   *    }
-   */
-  const int   charCount  = (int)resource.size();
-  const char *c_resource = resource.c_str();
-
-  StringVector resourceVector;
-
-  ResourceParserStep parsingStep = RESOURCE;
-  int                wordStart   = 1;
-  std::string        queryKey;
-
-  // Check for /gpu/cuda/occa, /gpu/hip/occa, /cpu/self/occa, /cpu/openmp/occa
-  // Note: added for matching style with other backends
-  if (resource == "/gpu/cuda/occa") {
-    match = "cuda";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (resource == "/gpu/hip/occa") {
-    match = "hip";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (resource == "/gpu/dpcpp/occa") {
-    match = "dpcpp";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (resource == "/gpu/opencl/occa") {
-    match = "opencl";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (resource == "/cpu/openmp/occa") {
-    match = "openmp";
-    return CEED_ERROR_SUCCESS;
-  }
-  if (resource == "/cpu/self/occa") {
-    match = "serial";
-    return CEED_ERROR_SUCCESS;
-  }
-
-  // Skip initial slash
-  for (int i = 1; i <= charCount; ++i) {
-    const char c = c_resource[i];
-
-    if (parsingStep == RESOURCE) {
-      if (c == RESOURCE_DELIMITER || c == QUERY_DELIMITER || c == '\0') {
-        resourceVector.push_back(resource.substr(wordStart, i - wordStart));
-        wordStart = i + 1;
-
-        // Check if we are done parsing the resource
-        if (c == QUERY_DELIMITER) {
-          parsingStep = QUERY_KEY;
-        }
-      }
-    } else if (parsingStep == QUERY_KEY) {
-      if (c == QUERY_KEY_VALUE_DELIMITER) {
-        queryKey  = resource.substr(wordStart, i - wordStart);
-        wordStart = i + 1;
-
-        // Looking to parse the query value now
-        parsingStep = QUERY_VALUE;
-      }
-    } else if (parsingStep == QUERY_VALUE) {
-      if (c == QUERY_ARG_DELIMITER || c == '\0') {
-        query[queryKey] = resource.substr(wordStart, i - wordStart);
-        wordStart       = i + 1;
-
-        // Back to parsing the next query argument
-        parsingStep = QUERY_KEY;
-        queryKey    = "";
-      }
-    }
-  }
-
-  // Looking for [match, "occa"]
-  if (resourceVector.size() != 2 || resourceVector[1] != "occa") {
-    return 1;
-  }
-
-  match = resourceVector[0];
-  return CEED_ERROR_SUCCESS;
-}
-
-void setDefaultProps(::occa::properties &deviceProps, const std::string &defaultMode) {
-  std::string mode;
-  if (deviceProps.has("mode")) {
-    // Don't override mode if passed
-    mode = (std::string)deviceProps["mode"];
-  } else {
-    mode = defaultMode;
-    deviceProps.set("mode", mode);
-  }
-
-  // Set default device id
-  if ((mode == "CUDA") || (mode == "HIP") || (mode == "dpcpp") || (mode == "OpenCL")) {
-    if (!deviceProps.has("device_id")) {
-      deviceProps["device_id"] = 0;
-    }
-  }
-
-  // Set default platform id
-  if ((mode == "dpcpp") || (mode == "OpenCL")) {
-    if (!deviceProps.has("platform_id")) {
-      deviceProps["platform_id"] = 0;
-    }
-  }
-}
-
-static int initCeed(const char *c_resource, Ceed ceed) {
-  int         ierr;
-  std::string match;
-  StringMap   query;
-
-  ierr = splitCeedResource(c_resource, match, query);
-  if (ierr) {
-    return CeedError(ceed, CEED_ERROR_BACKEND, "(OCCA) Backend cannot use resource: %s", c_resource);
-  }
-
-  std::string mode;
-  ierr = getDeviceMode(match, mode);
-  if (ierr) {
-    return CeedError(ceed, CEED_ERROR_BACKEND, "(OCCA) Backend cannot use resource: %s", c_resource);
-  }
-
-  std::string               devicePropsStr = "{\n";
-  StringMap::const_iterator it;
-  for (it = query.begin(); it != query.end(); ++it) {
-    devicePropsStr += "  \"";
-    devicePropsStr += it->first;
-    devicePropsStr += "\": ";
-    devicePropsStr += it->second;
-    devicePropsStr += ",\n";
-  }
-  devicePropsStr += '}';
-
-  ::occa::properties deviceProps(devicePropsStr);
-  setDefaultProps(deviceProps, mode);
-
-  ceed::occa::Context *context = new Context(::occa::device(deviceProps));
-  CeedCallBackend(CeedSetData(ceed, context));
-
-  return CEED_ERROR_SUCCESS;
-}
-
-static int destroyCeed(Ceed ceed) {
-  delete Context::from(ceed);
-  return CEED_ERROR_SUCCESS;
-}
-
-static int registerCeedFunction(Ceed ceed, const char *fname, ceed::occa::ceedFunction f) {
-  return CeedSetBackendFunction(ceed, "Ceed", ceed, fname, f);
-}
-
-static int preferHostMemType(CeedMemType *type) {
-  *type = CEED_MEM_HOST;
-  return CEED_ERROR_SUCCESS;
-}
-
-static int preferDeviceMemType(CeedMemType *type) {
-  *type = CEED_MEM_DEVICE;
-  return CEED_ERROR_SUCCESS;
-}
-
-static ceed::occa::ceedFunction getPreferredMemType(Ceed ceed) {
-  if (Context::from(ceed)->device.hasSeparateMemorySpace()) {
-    return (ceed::occa::ceedFunction)(void *)preferDeviceMemType;
-  }
-  return (ceed::occa::ceedFunction)(void *)preferHostMemType;
-}
-
-static int registerMethods(Ceed ceed) {
-  CeedOccaRegisterBaseFunction("Destroy", ceed::occa::destroyCeed);
-  CeedOccaRegisterBaseFunction("GetPreferredMemType", getPreferredMemType(ceed));
-  CeedOccaRegisterBaseFunction("VectorCreate", ceed::occa::Vector::ceedCreate);
-  CeedOccaRegisterBaseFunction("BasisCreateTensorH1", ceed::occa::TensorBasis::ceedCreate);
-  CeedOccaRegisterBaseFunction("BasisCreateH1", ceed::occa::SimplexBasis::ceedCreate);
-  CeedOccaRegisterBaseFunction("ElemRestrictionCreate", ceed::occa::ElemRestriction::ceedCreate);
-  CeedOccaRegisterBaseFunction("QFunctionCreate", ceed::occa::QFunction::ceedCreate);
-  CeedOccaRegisterBaseFunction("QFunctionContextCreate", ceed::occa::QFunctionContext::ceedCreate);
-  CeedOccaRegisterBaseFunction("OperatorCreate", ceed::occa::Operator::ceedCreate);
-  CeedOccaRegisterBaseFunction("CompositeOperatorCreate", ceed::occa::Operator::ceedCreateComposite);
-
-  return CEED_ERROR_SUCCESS;
-}
-
-static int registerBackend(const char *resource, Ceed ceed) {
-  try {
-    CeedCallBackend(ceed::occa::initCeed(resource, ceed));
-  } catch (const ::occa::exception &e) {
-    CeedHandleOccaException(e);
-  }
-  try {
-    CeedCallBackend(ceed::occa::registerMethods(ceed));
-  } catch (const ::occa::exception &e) {
-    CeedHandleOccaException(e);
-  }
-  return CEED_ERROR_SUCCESS;
-}
-}  // namespace occa
-}  // namespace ceed
-
-CEED_INTERN int CeedRegister_Occa(void) {
-  // General mode
-  CeedCallBackend(CeedRegister("/*/occa", ceed::occa::registerBackend, 270));
-  // CPU Modes
-  CeedCallBackend(CeedRegister("/cpu/self/occa", ceed::occa::registerBackend, 260));
-  CeedCallBackend(CeedRegister("/cpu/openmp/occa", ceed::occa::registerBackend, 250));
-  // GPU Modes
-  CeedCallBackend(CeedRegister("/gpu/dpcpp/occa", ceed::occa::registerBackend, 240));
-  CeedCallBackend(CeedRegister("/gpu/opencl/occa", ceed::occa::registerBackend, 230));
-  CeedCallBackend(CeedRegister("/gpu/hip/occa", ceed::occa::registerBackend, 220));
-  CeedCallBackend(CeedRegister("/gpu/cuda/occa", ceed::occa::registerBackend, 210));
-  return CEED_ERROR_SUCCESS;
-}
diff --git a/backends/occa/ceed-occa.h b/backends/occa/ceed-occa.h
deleted file mode 100644
index d9dc78ebd4..0000000000
--- a/backends/occa/ceed-occa.h
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include <ceed.h>
-#include <ceed/backend.h>
-#include <assert.h>
-#include <stdbool.h>
-#include <string.h>
-#include <sys/stat.h>
-
-// *****************************************************************************
-#define OCCA_PATH_MAX 4096
-
-// *****************************************************************************
-// used to get Dl_info struct declaration (vs _GNU_SOURCE?)
-#ifndef __USE_GNU
-#define __USE_GNU
-#endif
-#include <dlfcn.h>
-
-// *****************************************************************************
-#include "occa.h"
-
-// *****************************************************************************
-#define NO_OFFSET 0
-#define TILE_SIZE 32
-#define NO_PROPS occaDefault
-
-// *****************************************************************************
-// * CeedVector Occa struct
-// *****************************************************************************
-typedef struct {
-  CeedScalar *h_array;
-  CeedScalar *h_array_allocated;
-  occaMemory  d_array;
-} CeedVector_Occa;
-
-// *****************************************************************************
-// * CeedElemRestriction Occa struct
-// *****************************************************************************
-#define CEED_OCCA_NUM_RESTRICTION_KERNELS 8
-typedef struct {
-  bool       strided;
-  occaMemory d_indices;
-  occaMemory d_toffsets;
-  occaMemory d_tindices;
-  occaKernel kRestrict[CEED_OCCA_NUM_RESTRICTION_KERNELS];
-} CeedElemRestriction_Occa;
-
-// *****************************************************************************
-// * CeedBasis Occa struct
-// *****************************************************************************
-typedef struct {
-  bool                ready;
-  CeedElemRestriction er;
-  occaMemory          qref1d;
-  occaMemory          qweight1d;
-  occaMemory          interp1d;
-  occaMemory          grad1d;
-  occaMemory          tmp0, tmp1;
-  occaKernel          kZero, kInterp, kGrad, kWeight;
-} CeedBasis_Occa;
-
-// *****************************************************************************
-// * CeedOperator Occa struct
-// *****************************************************************************
-typedef struct {
-  CeedVector  *Evecs;  /// E-vectors needed to apply operator (in followed by out)
-  CeedScalar **Edata;
-  CeedVector  *evecsin;   /// Input E-vectors needed to apply operator
-  CeedVector  *evecsout;  /// Output E-vectors needed to apply operator
-  CeedVector  *qvecsin;   /// Input Q-vectors needed to apply operator
-  CeedVector  *qvecsout;  /// Output Q-vectors needed to apply operator
-  CeedInt      numein;
-  CeedInt      numeout;
-} CeedOperator_Occa;
-
-// *****************************************************************************
-// * CeedQFunction Occa struct
-// *****************************************************************************
-#define N_MAX_IDX 16
-typedef struct {
-  bool         ready;
-  CeedInt      idx, odx;
-  CeedInt      iOf7[N_MAX_IDX];
-  CeedInt      oOf7[N_MAX_IDX];
-  int          nc, dim, nelem, elemsize, e;
-  occaMemory   o_indata, o_outdata;
-  occaMemory   d_ctx, d_idx, d_odx;
-  char        *oklPath;
-  const char  *qFunctionName;
-  occaKernel   kQFunctionApply;
-  CeedOperator op;
-} CeedQFunction_Occa;
-
-// *****************************************************************************
-// * CeedQFunctionContext Occa struct
-// *****************************************************************************
-typedef struct {
-  CeedScalar *h_data;
-  CeedScalar *h_data_allocated;
-} CeedQFunctionContext_Occa;
-
-// *****************************************************************************
-// * Ceed Occa struct
-// *****************************************************************************
-typedef struct {
-  occaDevice device;
-  bool       ocl;
-  char      *libceed_dir;
-  char      *occa_cache_dir;
-} Ceed_Occa;
-
-// *****************************************************************************
-CEED_INTERN int CeedOklPath_Occa(const Ceed, const char *, const char *, char **);
-
-// *****************************************************************************
-CEED_INTERN int CeedOklDladdr_Occa(Ceed);
-
-// *****************************************************************************
-CEED_INTERN int CeedBasisCreateTensorH1_Occa(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d,
-                                             const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis);
-
-// *****************************************************************************
-CEED_INTERN int CeedBasisCreateH1_Occa(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp1d,
-                                       const CeedScalar *grad1d, const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis);
-
-// *****************************************************************************
-CEED_INTERN int CeedBasisApplyElems_Occa(CeedBasis basis, CeedInt Q, CeedTransposeMode tmode, CeedEvalMode emode, const CeedVector u, CeedVector v);
-
-// *****************************************************************************
-CEED_INTERN int CeedOperatorCreate_Occa(CeedOperator op);
-
-// *****************************************************************************
-CEED_INTERN int CeedQFunctionCreate_Occa(CeedQFunction qf);
-
-// *****************************************************************************
-CEED_INTERN int CeedQFunctionContextCreate_Occa(CeedQFunctionContext ctx);
-
-// *****************************************************************************
-CEED_INTERN int CeedElemRestrictionCreate_Occa(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *indices, const bool *orients,
-                                               const CeedInt8 *curl_orients, const CeedElemRestriction res);
-
-// *****************************************************************************
-CEED_INTERN int CeedVectorCreate_Occa(CeedInt n, CeedVector vec);
diff --git a/backends/occa/kernels/elem-restriction.cpp b/backends/occa/kernels/elem-restriction.cpp
deleted file mode 100644
index 947556be1f..0000000000
--- a/backends/occa/kernels/elem-restriction.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "./kernel-defines.hpp"
-
-// Kernels are based on the cuda backend from LLNL and VT groups
-//
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - COMPONENT_COUNT            : CeedInt
-// - ELEMENT_SIZE               : CeedInt
-// - NODE_COUNT                 : CeedInt
-// - TILE_SIZE                  : int
-// - USES_INDICES               : bool
-// - STRIDE_TYPE                : ceed::occa::StrideType
-// - NODE_STRIDE                : Optional[CeedInt]
-// - COMPONENT_STRIDE           : Optional[CeedInt]
-// - ELEMENT_STRIDE             : Optional[CeedInt]
-// - UNSTRIDED_COMPONENT_STRIDE : Optional[CeedInt]
-
-const char *occa_elem_restriction_source = STRINGIFY_SOURCE(
-
-    @directive("#define PRINT_KERNEL_HASHES 0")
-
-            typedef CeedScalar *
-        QuadVector @dim(ELEMENT_SIZE, COMPONENT_COUNT, elementCount);
-
-    @kernel void applyRestriction(const CeedInt elementCount, const CeedInt *indices, CeedScalar *u, QuadVector v) {
-      @tile(TILE_SIZE, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        @directive("#if PRINT_KERNEL_HASHES")
-            // Print to see which kernel is being run
-            if (element == 0) {
-          printf("\n\napplyRestriction Kernel: " OKL_KERNEL_HASH "\n\n");
-        }
-        @directive("#endif")
-
-            @directive("#if USES_INDICES") for (int node = 0; node < ELEMENT_SIZE; ++node) {
-          const CeedInt index = indices[node + (element * ELEMENT_SIZE)];
-
-          for (int c = 0; c < COMPONENT_COUNT; ++c) {
-            v(node, c, element) = u[index + (c * UNSTRIDED_COMPONENT_STRIDE)];
-          }
-        }
-        @directive("#else") for (int node = 0; node < ELEMENT_SIZE; ++node) {
-          for (int c = 0; c < COMPONENT_COUNT; ++c) {
-            v(node, c, element) = u[(node * NODE_STRIDE) + (c * COMPONENT_STRIDE) + (element * ELEMENT_STRIDE)];
-          }
-        }
-        @directive("#endif")
-      }
-    }
-
-    @directive("#if USES_INDICES")
-
-        @kernel void applyRestrictionTranspose(const CeedInt elementCount, const CeedInt *quadIndices, const CeedInt *dofOffsets,
-                                               const CeedInt *dofIndices, const QuadVector u, CeedScalar *v) {
-          @tile(TILE_SIZE, @outer, @inner) for (int n = 0; n < NODE_COUNT; ++n) {
-            @directive("#if PRINT_KERNEL_HASHES")
-                // Print to see which kernel is being run
-                if (n == 0) {
-              printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n");
-            }
-            @directive("#endif")
-
-                CeedScalar vComp[COMPONENT_COUNT];
-
-            // Prefetch index information
-            const CeedInt vIndex      = quadIndices[n];
-            const CeedInt offsetStart = dofOffsets[n];
-            const CeedInt offsetEnd   = dofOffsets[n + 1];
-
-            for (int c = 0; c < COMPONENT_COUNT; ++c) {
-              vComp[c] = 0;
-            }
-
-            // Aggregate by component
-            for (CeedInt i = offsetStart; i < offsetEnd; ++i) {
-              const CeedInt index = dofIndices[i];
-
-              const int node    = (index % ELEMENT_SIZE);
-              const int element = (index / ELEMENT_SIZE);
-
-              for (int c = 0; c < COMPONENT_COUNT; ++c) {
-                vComp[c] += u(node, c, element);
-              }
-            }
-
-            // Update dofs by component
-            for (int c = 0; c < COMPONENT_COUNT; ++c) {
-              v[vIndex + (c * UNSTRIDED_COMPONENT_STRIDE)] += vComp[c];
-            }
-          }
-        }
-
-    @directive("#else")  // USES_INDICES = false
-
-    @kernel void applyRestrictionTranspose(const CeedInt elementCount, const CeedInt *quadIndices, const CeedInt *dofOffsets,
-                                           const CeedInt *dofIndices, const QuadVector u, CeedScalar *v) {
-      @tile(TILE_SIZE, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        @directive("#if PRINT_KERNEL_HASHES")
-            // Print to see which kernel is being run
-            if (element == 0) {
-          printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n");
-        }
-        @directive("#endif")
-
-            for (int node = 0; node < ELEMENT_SIZE; ++node) {
-          for (int c = 0; c < COMPONENT_COUNT; ++c) {
-            v[(node * NODE_STRIDE) + (c * COMPONENT_STRIDE) + (element * ELEMENT_STRIDE)] += u(node, c, element);
-          }
-        }
-      }
-    }
-
-    @directive("#endif")  // USES_INDICES
-
-);
diff --git a/backends/occa/kernels/elem-restriction.hpp b/backends/occa/kernels/elem-restriction.hpp
deleted file mode 100644
index ac45de6c49..0000000000
--- a/backends/occa/kernels/elem-restriction.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_KERNELS_ELEMRESTRICTION_HEADER
-#define CEED_OCCA_KERNELS_ELEMRESTRICTION_HEADER
-
-// Kernels are based on the cuda backend from LLNL and VT groups
-//
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - COMPONENT_COUNT            : CeedInt
-// - ELEMENT_SIZE               : CeedInt
-// - NODE_COUNT                 : CeedInt
-// - TILE_SIZE                  : int
-// - USES_INDICES               : bool
-// - STRIDE_TYPE                : ceed::occa::StrideType
-// - NODE_STRIDE                : Optional[CeedInt]
-// - COMPONENT_STRIDE           : Optional[CeedInt]
-// - ELEMENT_STRIDE             : Optional[CeedInt]
-// - UNSTRIDED_COMPONENT_STRIDE : Optional[CeedInt]
-
-extern const char *occa_elem_restriction_source;
-
-#endif
diff --git a/backends/occa/kernels/set-value.cpp b/backends/occa/kernels/set-value.cpp
deleted file mode 100644
index a7a756e442..0000000000
--- a/backends/occa/kernels/set-value.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "./kernel-defines.hpp"
-
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - BLOCK_SIZE : CeedInt
-
-const char *occa_set_value_source = STRINGIFY_SOURCE(
-
-    @kernel void setValue(CeedScalar *ptr, const CeedScalar value, const CeedInt count) {
-      @tile(BLOCK_SIZE, @outer, @inner) for (CeedInt i = 0; i < count; ++i) {
-        ptr[i] = value;
-      }
-    });
diff --git a/backends/occa/kernels/set-value.hpp b/backends/occa/kernels/set-value.hpp
deleted file mode 100644
index fa5303f5f0..0000000000
--- a/backends/occa/kernels/set-value.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_KERNELS_SETVALUE_HEADER
-#define CEED_OCCA_KERNELS_SETVALUE_HEADER
-
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - BLOCK_SIZE : CeedInt
-
-extern const char *occa_set_value_source;
-
-#endif
\ No newline at end of file
diff --git a/backends/occa/kernels/simplex-basis.hpp b/backends/occa/kernels/simplex-basis.hpp
deleted file mode 100644
index 4f53e5c6dd..0000000000
--- a/backends/occa/kernels/simplex-basis.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_KERNELS_SIMPLEXBASIS_HEADER
-#define CEED_OCCA_KERNELS_SIMPLEXBASIS_HEADER
-
-// Kernels are based on the cuda backend from LLNL and VT groups
-//
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - DIM                  : CeedInt
-// - Q                    : CeedInt
-// - P                    : CeedInt
-// - MAX_PQ               : CeedInt
-// - BASIS_COMPONENT_COUNT: CeedInt
-// - ELEMENTS_PER_BLOCK   : CeedInt
-// - TRANSPOSE            : bool
-
-extern const char *occa_simplex_basis_cpu_function_source;
-extern const char *occa_simplex_basis_cpu_kernel_source;
-
-extern const char *occa_simplex_basis_gpu_source;
-
-#endif
diff --git a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp
deleted file mode 100644
index 39a36684c2..0000000000
--- a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../kernel-defines.hpp"
-
-const char *occa_simplex_basis_cpu_function_source = STRINGIFY_SOURCE(
-
-    @directive("#define SIMPLEX_FUNCTION(FUNCTION_NAME) simplex_ ## DIM ## d_ ## FUNCTION_NAME ## _Q ## Q ## _P ## P")
-
-        inline void SIMPLEX_FUNCTION(interpElement)(const CeedScalar *B @dim(P, Q), const CeedScalar *Ue, CeedScalar *Ve) {
-          for (int q = 0; q < Q; ++q) {
-            CeedScalar v = 0;
-            for (int p = 0; p < P; ++p) {
-              v += B(p, q) * Ue[p];
-            }
-            Ve[q] = v;
-          }
-        }
-
-    inline void SIMPLEX_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P, Q), const CeedScalar *Ue, CeedScalar *Ve) {
-      for (int p = 0; p < P; ++p) {
-        CeedScalar v = 0;
-        for (int q = 0; q < Q; ++q) {
-          v += B(p, q) * Ue[q];
-        }
-        Ve[p] = v;
-      }
-    }
-
-    inline void SIMPLEX_FUNCTION(gradElement)(const CeedScalar *Bx @dim(P, Q, DIM), const CeedScalar *Ue, CeedScalar *Ve, ) {
-      for (int q = 0; q < Q; ++q) {
-        CeedScalar v[DIM];
-        for (int dim = 0; dim < DIM; ++dim) {
-          v[dim] = 0;
-        }
-
-        for (int p = 0; p < P; ++p) {
-          const CeedScalar u = Ue[p];
-          for (int dim = 0; dim < DIM; ++dim) {
-            v[dim] += Bx(p, q, dim) * u;
-          }
-        }
-
-        for (int dim = 0; dim < DIM; ++dim) {
-          Ve[dim * Q + q] = v[dim];
-        }
-      }
-    }
-
-    inline void SIMPLEX_FUNCTION(gradElementTranspose)(const CeedScalar *Bx @dim(P, Q, DIM), const CeedScalar *Ue, CeedScalar *Ve) {
-      for (int p = 0; p < P; ++p) {
-        CeedScalar v = 0;
-        for (int dim = 0; dim < DIM; ++dim) {
-          for (int q = 0; q < Q; ++q) {
-            v += Bx(p, q, dim) * Ue[dim * Q + q];
-          }
-        }
-        Ve[p] = v;
-      }
-    }
-
-    inline void SIMPLEX_FUNCTION(weightElement)(const CeedScalar *qWeights, CeedScalar *We) {
-      for (int q = 0; q < Q; ++q) {
-        We[q] = qWeights[q];
-      }
-    }
-
-);
-
-const char *occa_simplex_basis_cpu_kernel_source = STRINGIFY_SOURCE(
-
-    @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P, BASIS_COMPONENT_COUNT, elementCount) = U;
-            CeedScalar       *Ve @dim(Q, elementCount, BASIS_COMPONENT_COUNT) = V;
-
-            SIMPLEX_FUNCTION(interpElement)(B, &Ue(0, component, element), &Ve(0, element, component));
-          } else {
-            const CeedScalar *Ue @dim(Q, elementCount, BASIS_COMPONENT_COUNT) = U;
-            CeedScalar       *Ve @dim(P, BASIS_COMPONENT_COUNT, elementCount) = V;
-
-            SIMPLEX_FUNCTION(interpElementTranspose)(B, &Ue(0, element, component), &Ve(0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P, BASIS_COMPONENT_COUNT, elementCount)       = U;
-            CeedScalar       *_Ve @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM) = V;
-
-            CeedScalar Ve[DIM][Q];
-            for (int dim = 0; dim < DIM; ++dim) {
-              for (int q = 0; q < Q; ++q) {
-                Ve[dim][q] = _Ve(q, element, component, dim);
-              }
-            }
-
-            SIMPLEX_FUNCTION(gradElement)(Bx, &Ue(0, component, element), (CeedScalar *)Ve);
-
-            for (int dim = 0; dim < DIM; ++dim) {
-              for (int q = 0; q < Q; ++q) {
-                _Ve(q, element, component, dim) = Ve[dim][q];
-              }
-            }
-          } else {
-            const CeedScalar *_Ue @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM) = U;
-            CeedScalar       *Ve @dim(P, BASIS_COMPONENT_COUNT, elementCount)       = V;
-
-            CeedScalar Ue[DIM][Q];
-            for (int dim = 0; dim < DIM; ++dim) {
-              for (int q = 0; q < Q; ++q) {
-                Ue[dim][q] = _Ue(q, element, component, dim);
-              }
-            }
-
-            SIMPLEX_FUNCTION(gradElementTranspose)(Bx, (CeedScalar *)Ue, &Ve(0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights, CeedScalar *W @dim(Q, elementCount)) {
-      @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        SIMPLEX_FUNCTION(weightElement)(qWeights, &W(0, element));
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp
deleted file mode 100644
index aa09fa60d7..0000000000
--- a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../kernel-defines.hpp"
-
-const char *occa_simplex_basis_gpu_source = STRINGIFY_SOURCE(
-
-    @directive("#if TRANSPOSE") typedef CeedScalar * dofArray @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM);
-    typedef CeedScalar * quadArray @dim(P, BASIS_COMPONENT_COUNT, elementCount, DIM);
-    @directive("#else") typedef CeedScalar * dofArray @dim(P, BASIS_COMPONENT_COUNT, elementCount, DIM);
-    typedef CeedScalar * quadArray @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM); @directive("#endif")
-
-                                                                                          typedef CeedScalar *
-                                                                                      quadToDof @dim(P, Q);
-    typedef CeedScalar * dQuadToDof @dim(P, Q, DIM); typedef CeedScalar * elementWeightArray @dim(Q, elementCount);
-
-    @kernel void interp(const CeedInt elementCount, const quadToDof B, const dofArray U, quadArray V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar s_B[P * Q] @dim(P, Q);
-
-        // Store weights in shared memory
-        for (int i = 0; i < MAX_PQ; ++i; @inner) {
-          for (int j = i; j < (P * Q); j += MAX_PQ) {
-            s_B[j] = B[j];
-          }
-        }
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) {
-          for (int i = 0; i < MAX_PQ; ++i; @inner) {
-            const int element = elementOffset + localElement;
-            if (element < elementCount) {
-              // Element operation
-              for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                if (!TRANSPOSE) {
-                  const int q = i;
-                  if (q < Q) {
-                    CeedScalar v = 0;
-                    for (int p = 0; p < P; ++p) {
-                      v += s_B(p, q) * U(p, component, element, 0);
-                    }
-                    V(q, element, component, 0) = v;
-                  }
-                } else {
-                  const int p = i;
-                  if (p < P) {
-                    CeedScalar v = 0;
-                    for (int q = 0; q < Q; ++q) {
-                      v += s_B(p, q) * U(q, element, component, 0);
-                    }
-                    V(p, component, element, 0) = v;
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, const dQuadToDof Bx, const dofArray U, quadArray V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar s_Bx[Q * P * DIM] @dim(P, Q, DIM);
-
-        // Store weights in shared memory
-        for (int i = 0; i < MAX_PQ; ++i; @inner) {
-          for (int j = i; j < (P * Q * DIM); j += MAX_PQ) {
-            s_Bx[j] = Bx[j];
-          }
-        }
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) {
-          for (int i = 0; i < MAX_PQ; ++i; @inner) {
-            const int element = elementOffset + localElement;
-            if (element < elementCount) {
-              // Element operation
-              for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                if (!TRANSPOSE) {
-                  const int q = i;
-                  if (q < Q) {
-                    CeedScalar v[DIM];
-                    for (int dim = 0; dim < DIM; ++dim) {
-                      v[dim] = 0;
-                    }
-
-                    for (int p = 0; p < P; ++p) {
-                      const CeedScalar u = U(p, component, element, 0);
-                      for (int dim = 0; dim < DIM; ++dim) {
-                        v[dim] += s_Bx(p, q, dim) * u;
-                      }
-                    }
-
-                    for (int dim = 0; dim < DIM; ++dim) {
-                      V(q, element, component, dim) = v[dim];
-                    }
-                  }
-                } else {
-                  const int p = i;
-                  if (p < P) {
-                    CeedScalar v = 0;
-                    for (int dim = 0; dim < DIM; ++dim) {
-                      for (int q = 0; q < Q; ++q) {
-                        v += s_Bx(p, q, dim) * U(q, element, component, dim);
-                      }
-                    }
-                    V(p, component, element, 0) = v;
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights, elementWeightArray W) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar s_qWeights[Q];
-
-        for (int q = 0; q < Q; ++q; @inner) {
-          s_qWeights[q] = qWeights[q];
-        }
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) {
-          const int element = elementOffset + localElement;
-          if (element < elementCount) {
-            for (int q = 0; q < Q; ++q; @inner) {
-              W(q, element) = s_qWeights[q];
-            }
-          }
-        }
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis.hpp b/backends/occa/kernels/tensor-basis.hpp
deleted file mode 100644
index 54850a7830..0000000000
--- a/backends/occa/kernels/tensor-basis.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#ifndef CEED_OCCA_KERNELS_TENSORBASIS_HEADER
-#define CEED_OCCA_KERNELS_TENSORBASIS_HEADER
-
-// Kernels are based on the cuda backend from LLNL and VT groups
-//
-// Expects the following types to be defined:
-// - CeedInt
-// - CeedScalar
-//
-// Expects the following constants to be defined:
-// - Q1D                  : CeedInt
-// - P1D                  : CeedInt
-// - BASIS_COMPONENT_COUNT: CeedInt
-// - ELEMENTS_PER_BLOCK   : CeedInt
-// - SHARED_BUFFER_SIZE   : CeedInt
-// - TRANSPOSE            : bool
-
-extern const char *occa_tensor_basis_1d_cpu_function_source;
-extern const char *occa_tensor_basis_1d_cpu_kernel_source;
-
-extern const char *occa_tensor_basis_2d_cpu_function_source;
-extern const char *occa_tensor_basis_2d_cpu_kernel_source;
-
-extern const char *occa_tensor_basis_3d_cpu_function_source;
-extern const char *occa_tensor_basis_3d_cpu_kernel_source;
-
-extern const char *occa_tensor_basis_1d_gpu_source;
-extern const char *occa_tensor_basis_2d_gpu_source;
-extern const char *occa_tensor_basis_3d_gpu_source;
-
-#endif
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp
deleted file mode 100644
index 90c9e905d5..0000000000
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_1d_cpu_function_source = STRINGIFY_SOURCE(
-
-    @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_1d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D")
-
-        inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue, CeedScalar *Ve) {
-          for (int q = 0; q < Q1D; ++q) {
-            CeedScalar Vq = 0;
-            for (int p = 0; p < P1D; ++p) {
-              Vq += B(p, q) * Ue[p];
-            }
-            Ve[q] = Vq;
-          }
-        }
-
-    inline void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue, CeedScalar *Ve) {
-      for (int p = 0; p < P1D; ++p) {
-        CeedScalar Vp = 0;
-        for (int q = 0; q < Q1D; ++q) {
-          Vp += B(p, q) * Ue[q];
-        }
-        Ve[p] = Vp;
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), const CeedScalar *Ue,
-                                             CeedScalar *Ve) {
-      for (int q = 0; q < Q1D; ++q) {
-        CeedScalar Vq = 0;
-        for (int p = 0; p < P1D; ++p) {
-          Vq += Bx(p, q) * Ue[p];
-        }
-        Ve[q] = Vq;
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), const CeedScalar *Ue,
-                                                      CeedScalar *Ve) {
-      for (int p = 0; p < P1D; ++p) {
-        CeedScalar Vp = 0;
-        for (int q = 0; q < Q1D; ++q) {
-          Vp += Bx(p, q) * Ue[q];
-        }
-        Ve[p] = Vp;
-      }
-    }
-
-    inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We) {
-      for (int q = 0; q < Q1D; ++q) {
-        We[q] = qWeights1D[q];
-      }
-    }
-
-);
-
-const char *occa_tensor_basis_1d_cpu_kernel_source = STRINGIFY_SOURCE(
-
-    @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = U;
-            CeedScalar       *Ve @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = V;
-
-            TENSOR_FUNCTION(interpElement)(B, &Ue(0, component, element), &Ve(0, element, component));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = U;
-            CeedScalar       *Ve @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = V;
-
-            TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, element, component), &Ve(0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = U;
-            CeedScalar       *Ve @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = V;
-
-            TENSOR_FUNCTION(gradElement)(B, Bx, &Ue(0, component, element), &Ve(0, element, component));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = U;
-            CeedScalar       *Ve @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = V;
-
-            TENSOR_FUNCTION(gradElementTranspose)(B, Bx, &Ue(0, element, component), &Ve(0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W @dim(Q1D, elementCount)) {
-      @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, element));
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp
deleted file mode 100644
index d150129584..0000000000
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_2d_cpu_function_source = STRINGIFY_SOURCE(
-
-    @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_2d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D")
-
-        inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(P1D, P1D),
-                                                   CeedScalar *Ve @dim(Q1D, Q1D)) {
-          for (int qy = 0; qy < Q1D; ++qy) {
-            for (int qx = 0; qx < Q1D; ++qx) {
-              Ve(qx, qy) = 0;
-            }
-          }
-
-          for (int py = 0; py < P1D; ++py) {
-            CeedScalar V_x[Q1D];
-            for (int qx = 0; qx < Q1D; ++qx) {
-              V_x[qx] = 0;
-            }
-
-            for (int px = 0; px < P1D; ++px) {
-              const CeedScalar Up = Ue(px, py);
-              for (int qx = 0; qx < Q1D; ++qx) {
-                V_x[qx] += B(px, qx) * Up;
-              }
-            }
-
-            for (int qy = 0; qy < Q1D; ++qy) {
-              const CeedScalar w = B(py, qy);
-              for (int qx = 0; qx < Q1D; ++qx) {
-                Ve(qx, qy) += w * V_x[qx];
-              }
-            }
-          }
-        }
-
-    inline void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(Q1D, Q1D),
-                                                        CeedScalar *Ve @dim(P1D, P1D)) {
-      for (int py = 0; py < P1D; ++py) {
-        for (int px = 0; px < P1D; ++px) {
-          Ve(px, py) = 0;
-        }
-      }
-
-      for (int qy = 0; qy < Q1D; ++qy) {
-        CeedScalar V_x[P1D];
-        for (int py = 0; py < P1D; ++py) {
-          V_x[py] = 0;
-        }
-
-        for (int qx = 0; qx < Q1D; ++qx) {
-          const CeedScalar Up = Ue(qx, qy);
-          for (int px = 0; px < P1D; ++px) {
-            V_x[px] += B(px, qx) * Up;
-          }
-        }
-
-        for (int py = 0; py < P1D; ++py) {
-          const CeedScalar w = B(py, qy);
-          for (int px = 0; px < P1D; ++px) {
-            Ve(px, py) += w * V_x[px];
-          }
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D),
-                                             const CeedScalar *Ue @dim(P1D, P1D), CeedScalar *Ve_x @dim(Q1D, Q1D), CeedScalar *Ve_y @dim(Q1D, Q1D)) {
-      CeedScalar grad[Q1D][Q1D][2];
-      for (int qy = 0; qy < Q1D; ++qy) {
-        for (int qx = 0; qx < Q1D; ++qx) {
-          grad[qy][qx][0] = 0;
-          grad[qy][qx][1] = 0;
-        }
-      }
-
-      for (int py = 0; py < P1D; ++py) {
-        CeedScalar gradX[Q1D][2];
-        for (int qx = 0; qx < Q1D; ++qx) {
-          gradX[qx][0] = 0;
-          gradX[qx][1] = 0;
-        }
-
-        for (int px = 0; px < P1D; ++px) {
-          const CeedScalar Up = Ue(px, py);
-          for (int qx = 0; qx < Q1D; ++qx) {
-            gradX[qx][0] += Up * B(px, qx);
-            gradX[qx][1] += Up * Bx(px, qx);
-          }
-        }
-
-        for (int qy = 0; qy < Q1D; ++qy) {
-          const CeedScalar wy  = B(py, qy);
-          const CeedScalar wDy = Bx(py, qy);
-          for (int qx = 0; qx < Q1D; ++qx) {
-            const CeedScalar wx  = gradX[qx][0];
-            const CeedScalar wDx = gradX[qx][1];
-            grad[qy][qx][0] += wDx * wy;
-            grad[qy][qx][1] += wx * wDy;
-          }
-        }
-      }
-      for (int qy = 0; qy < Q1D; ++qy) {
-        for (int qx = 0; qx < Q1D; ++qx) {
-          Ve_x(qx, qy) = grad[qy][qx][0];
-          Ve_y(qx, qy) = grad[qy][qx][1];
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D),
-                                                      const CeedScalar *Ue_x @dim(Q1D, Q1D), const CeedScalar *Ue_y @dim(Q1D, Q1D),
-                                                      CeedScalar *Ve @dim(P1D, P1D)) {
-      for (int py = 0; py < P1D; ++py) {
-        for (int px = 0; px < P1D; ++px) {
-          Ve(px, py) = 0.0;
-        }
-      }
-
-      for (int qy = 0; qy < Q1D; ++qy) {
-        CeedScalar gradX[P1D][2];
-        for (int px = 0; px < P1D; ++px) {
-          gradX[px][0] = 0;
-          gradX[px][1] = 0;
-        }
-
-        for (int qx = 0; qx < Q1D; ++qx) {
-          const CeedScalar Ux = Ue_x(qx, qy);
-          const CeedScalar Uy = Ue_y(qx, qy);
-          for (int px = 0; px < P1D; ++px) {
-            const CeedScalar wx  = B(px, qx);
-            const CeedScalar wDx = Bx(px, qx);
-            gradX[px][0] += Ux * wDx;
-            gradX[px][1] += Uy * wx;
-          }
-        }
-
-        for (int py = 0; py < P1D; ++py) {
-          const CeedScalar wy  = B(py, qy);
-          const CeedScalar wDy = Bx(py, qy);
-          for (int px = 0; px < P1D; ++px) {
-            Ve(px, py) += ((gradX[px][0] * wy) + (gradX[px][1] * wDy));
-          }
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We @dim(Q1D, Q1D)) {
-      for (int qy = 0; qy < Q1D; ++qy) {
-        const CeedScalar wy = qWeights1D[qy];
-        for (int qx = 0; qx < Q1D; ++qx) {
-          We(qx, qy) = qWeights1D[qx] * wy;
-        }
-      }
-    }
-
-);
-
-const char *occa_tensor_basis_2d_cpu_kernel_source = STRINGIFY_SOURCE(
-
-    @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U;
-            CeedScalar       *Ve @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = V;
-
-            TENSOR_FUNCTION(interpElement)(B, &Ue(0, 0, component, element), &Ve(0, 0, element, component));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = U;
-            CeedScalar       *Ve @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V;
-
-            TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, 0, element, component), &Ve(0, 0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)    = U;
-            CeedScalar       *Ve @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2) = V;
-
-            TENSOR_FUNCTION(gradElement)(B, Bx, &Ue(0, 0, component, element), &Ve(0, 0, element, component, 0), &Ve(0, 0, element, component, 1));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2) = U;
-            CeedScalar       *Ve @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)    = V;
-
-            TENSOR_FUNCTION(gradElementTranspose)
-            (B, Bx, &Ue(0, 0, element, component, 0), &Ue(0, 0, element, component, 1), &Ve(0, 0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W @dim(Q1D, Q1D, elementCount)) {
-      @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, 0, element));
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp
deleted file mode 100644
index 942470b85f..0000000000
--- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp
+++ /dev/null
@@ -1,306 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_3d_cpu_function_source = STRINGIFY_SOURCE(
-
-    @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_3d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D")
-
-        inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(P1D, P1D, P1D),
-                                                   CeedScalar *Ve @dim(Q1D, Q1D, Q1D)) {
-          for (int qz = 0; qz < Q1D; ++qz) {
-            for (int qy = 0; qy < Q1D; ++qy) {
-              for (int qx = 0; qx < Q1D; ++qx) {
-                Ve(qx, qy, qz) = 0;
-              }
-            }
-          }
-
-          for (int pz = 0; pz < P1D; ++pz) {
-            CeedScalar V_xy[Q1D][Q1D];
-            for (int qy = 0; qy < Q1D; ++qy) {
-              for (int qx = 0; qx < Q1D; ++qx) {
-                V_xy[qy][qx] = 0;
-              }
-            }
-
-            for (int py = 0; py < P1D; ++py) {
-              CeedScalar V_x[Q1D];
-              for (int qx = 0; qx < Q1D; ++qx) {
-                V_x[qx] = 0;
-              }
-
-              for (int px = 0; px < P1D; ++px) {
-                const CeedScalar Up = Ue(px, py, pz);
-                for (int qx = 0; qx < Q1D; ++qx) {
-                  V_x[qx] += B(px, qx) * Up;
-                }
-              }
-
-              for (int qy = 0; qy < Q1D; ++qy) {
-                const CeedScalar wy = B(py, qy);
-                for (int qx = 0; qx < Q1D; ++qx) {
-                  V_xy[qy][qx] += wy * V_x[qx];
-                }
-              }
-            }
-
-            for (int qz = 0; qz < Q1D; ++qz) {
-              const CeedScalar wz = B(pz, qz);
-              for (int qy = 0; qy < Q1D; ++qy) {
-                for (int qx = 0; qx < Q1D; ++qx) {
-                  Ve(qx, qy, qz) += wz * V_xy[qy][qx];
-                }
-              }
-            }
-          }
-        }
-
-    inline void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(Q1D, Q1D, Q1D),
-                                                        CeedScalar *Ve @dim(P1D, P1D, P1D)) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        for (int py = 0; py < P1D; ++py) {
-          for (int px = 0; px < P1D; ++px) {
-            Ve(px, py, pz) = 0;
-          }
-        }
-      }
-
-      for (int qz = 0; qz < Q1D; ++qz) {
-        CeedScalar V_xy[P1D][P1D];
-        for (int py = 0; py < P1D; ++py) {
-          for (int px = 0; px < P1D; ++px) {
-            V_xy[py][px] = 0;
-          }
-        }
-
-        for (int qy = 0; qy < Q1D; ++qy) {
-          CeedScalar V_x[P1D];
-          for (int px = 0; px < P1D; ++px) {
-            V_x[px] = 0;
-          }
-
-          for (int qx = 0; qx < Q1D; ++qx) {
-            const CeedScalar Uq = Ue(qx, qy, qz);
-            for (int px = 0; px < P1D; ++px) {
-              V_x[px] += B(px, qx) * Uq;
-            }
-          }
-
-          for (int py = 0; py < P1D; ++py) {
-            const CeedScalar wy = B(py, qy);
-            for (int px = 0; px < P1D; ++px) {
-              V_xy[py][px] += wy * V_x[px];
-            }
-          }
-        }
-
-        for (int pz = 0; pz < P1D; ++pz) {
-          const CeedScalar wz = B(pz, qz);
-          for (int py = 0; py < P1D; ++py) {
-            for (int px = 0; px < P1D; ++px) {
-              Ve(px, py, pz) += wz * V_xy[py][px];
-            }
-          }
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D),
-                                             const CeedScalar *Ue @dim(P1D, P1D, P1D), CeedScalar *Ve_x @dim(Q1D, Q1D, Q1D),
-                                             CeedScalar *Ve_y @dim(Q1D, Q1D, Q1D), CeedScalar *Ve_z @dim(Q1D, Q1D, Q1D)) {
-      for (int qz = 0; qz < Q1D; ++qz) {
-        for (int qy = 0; qy < Q1D; ++qy) {
-          for (int qx = 0; qx < Q1D; ++qx) {
-            Ve_x(qx, qy, qz) = 0;
-            Ve_y(qx, qy, qz) = 0;
-            Ve_z(qx, qy, qz) = 0;
-          }
-        }
-      }
-
-      for (int pz = 0; pz < P1D; ++pz) {
-        CeedScalar gradXY[Q1D][Q1D][3];
-        for (int qy = 0; qy < Q1D; ++qy) {
-          for (int qx = 0; qx < Q1D; ++qx) {
-            gradXY[qy][qx][0] = 0;
-            gradXY[qy][qx][1] = 0;
-            gradXY[qy][qx][2] = 0;
-          }
-        }
-
-        for (int py = 0; py < P1D; ++py) {
-          CeedScalar gradX[Q1D][2];
-          for (int qx = 0; qx < Q1D; ++qx) {
-            gradX[qx][0] = 0;
-            gradX[qx][1] = 0;
-          }
-
-          for (int px = 0; px < P1D; ++px) {
-            const CeedScalar Up = Ue(px, py, pz);
-            for (int qx = 0; qx < Q1D; ++qx) {
-              gradX[qx][0] += Up * B(px, qx);
-              gradX[qx][1] += Up * Bx(px, qx);
-            }
-          }
-
-          for (int qy = 0; qy < Q1D; ++qy) {
-            const CeedScalar wy  = B(py, qy);
-            const CeedScalar wDy = Bx(py, qy);
-            for (int qx = 0; qx < Q1D; ++qx) {
-              const CeedScalar wx  = gradX[qx][0];
-              const CeedScalar wDx = gradX[qx][1];
-              gradXY[qy][qx][0] += wDx * wy;
-              gradXY[qy][qx][1] += wx * wDy;
-              gradXY[qy][qx][2] += wx * wy;
-            }
-          }
-        }
-
-        for (int qz = 0; qz < Q1D; ++qz) {
-          const CeedScalar wz  = B(pz, qz);
-          const CeedScalar wDz = Bx(pz, qz);
-          for (int qy = 0; qy < Q1D; ++qy) {
-            for (int qx = 0; qx < Q1D; ++qx) {
-              Ve_x(qx, qy, qz) += gradXY[qy][qx][0] * wz;
-              Ve_y(qx, qy, qz) += gradXY[qy][qx][1] * wz;
-              Ve_z(qx, qy, qz) += gradXY[qy][qx][2] * wDz;
-            }
-          }
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D),
-                                                      const CeedScalar *Ue_x @dim(Q1D, Q1D, Q1D), const CeedScalar *Ue_y @dim(Q1D, Q1D, Q1D),
-                                                      const CeedScalar *Ue_z @dim(Q1D, Q1D, Q1D), CeedScalar *Ve @dim(P1D, P1D, P1D)) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        for (int py = 0; py < P1D; ++py) {
-          for (int px = 0; px < P1D; ++px) {
-            Ve(px, py, pz) = 0;
-          }
-        }
-      }
-
-      for (int qz = 0; qz < Q1D; ++qz) {
-        CeedScalar gradXY[P1D][P1D][3];
-        for (int py = 0; py < P1D; ++py) {
-          for (int px = 0; px < P1D; ++px) {
-            gradXY[py][px][0] = 0;
-            gradXY[py][px][1] = 0;
-            gradXY[py][px][2] = 0;
-          }
-        }
-
-        for (int qy = 0; qy < Q1D; ++qy) {
-          CeedScalar gradX[P1D][3];
-          for (int px = 0; px < P1D; ++px) {
-            gradX[px][0] = 0;
-            gradX[px][1] = 0;
-            gradX[px][2] = 0;
-          }
-
-          for (int qx = 0; qx < Q1D; ++qx) {
-            const CeedScalar Ux = Ue_x(qx, qy, qz);
-            const CeedScalar Uy = Ue_y(qx, qy, qz);
-            const CeedScalar Uz = Ue_z(qx, qy, qz);
-            for (int px = 0; px < P1D; ++px) {
-              const CeedScalar wx  = B(px, qx);
-              const CeedScalar wDx = Bx(px, qx);
-              gradX[px][0] += Ux * wDx;
-              gradX[px][1] += Uy * wx;
-              gradX[px][2] += Uz * wx;
-            }
-          }
-
-          for (int py = 0; py < P1D; ++py) {
-            const CeedScalar wy  = B(py, qy);
-            const CeedScalar wDy = Bx(py, qy);
-            for (int px = 0; px < P1D; ++px) {
-              gradXY[py][px][0] += gradX[px][0] * wy;
-              gradXY[py][px][1] += gradX[px][1] * wDy;
-              gradXY[py][px][2] += gradX[px][2] * wy;
-            }
-          }
-        }
-
-        for (int pz = 0; pz < P1D; ++pz) {
-          const CeedScalar wz  = B(pz, qz);
-          const CeedScalar wDz = Bx(pz, qz);
-          for (int py = 0; py < P1D; ++py) {
-            for (int px = 0; px < P1D; ++px) {
-              Ve(px, py, pz) += ((gradXY[py][px][0] * wz) + (gradXY[py][px][1] * wz) + (gradXY[py][px][2] * wDz));
-            }
-          }
-        }
-      }
-    }
-
-    inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We @dim(Q1D, Q1D, Q1D)) {
-      for (int qz = 0; qz < Q1D; ++qz) {
-        const CeedScalar wz = qWeights1D[qz];
-        for (int qy = 0; qy < Q1D; ++qy) {
-          const CeedScalar wy = qWeights1D[qy];
-          for (int qx = 0; qx < Q1D; ++qx) {
-            We(qx, qy, qz) = qWeights1D[qx] * wy * wz;
-          }
-        }
-      }
-    }
-
-);
-
-const char *occa_tensor_basis_3d_cpu_kernel_source = STRINGIFY_SOURCE(
-
-    @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U;
-            CeedScalar       *Ve @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = V;
-
-            TENSOR_FUNCTION(interpElement)(B, &Ue(0, 0, 0, component, element), &Ve(0, 0, 0, element, component));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = U;
-            CeedScalar       *Ve @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V;
-
-            TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, 0, 0, element, component), &Ve(0, 0, 0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          if (!TRANSPOSE) {
-            const CeedScalar *Ue @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)    = U;
-            CeedScalar       *Ve @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3) = V;
-
-            TENSOR_FUNCTION(gradElement)
-            (B, Bx, &Ue(0, 0, 0, component, element), &Ve(0, 0, 0, element, component, 0), &Ve(0, 0, 0, element, component, 1),
-             &Ve(0, 0, 0, element, component, 2));
-          } else {
-            const CeedScalar *Ue @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3) = U;
-            CeedScalar       *Ve @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount)    = V;
-
-            TENSOR_FUNCTION(gradElementTranspose)
-            (B, Bx, &Ue(0, 0, 0, element, component, 0), &Ue(0, 0, 0, element, component, 1), &Ue(0, 0, 0, element, component, 2),
-             &Ve(0, 0, 0, component, element));
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W @dim(Q1D, Q1D, Q1D, elementCount)) {
-      @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
-        TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, 0, 0, element));
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp
deleted file mode 100644
index 34377f29b9..0000000000
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_1d_gpu_source = STRINGIFY_SOURCE(
-
-    typedef CeedScalar * dofArray @dim(P1D, BASIS_COMPONENT_COUNT, elementCount);
-    typedef const CeedScalar *const_dofArray @dim(P1D, BASIS_COMPONENT_COUNT, elementCount);
-
-    typedef CeedScalar * quadArray @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT);
-    typedef const CeedScalar *const_quadArray @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT);
-
-    typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, ELEMENTS_PER_BLOCK); typedef const CeedScalar *quadToDof @dim(P1D, Q1D);
-    typedef CeedScalar * elementWeightArray @dim(Q1D, elementCount);
-
-    //---[ Utility Methods ]----------------
-    inline void readDofs(const int element, const int localElement, const int component, const int p, const_dofArray U,
-                         sharedBufferArray sharedBuffer) {
-      // Zero out extra entries
-      sharedBuffer(p, localElement) = ((p < P1D) ? U(p, component, element) : 0.0);
-    }
-
-    inline void writeDofs(const int element, const int component, const int p, const CeedScalar Vp, dofArray V) {
-      if (p < P1D) {
-        V(p, component, element) = Vp;
-      }
-    }
-
-    inline void readQuads(const int elementCount, const int element, const int localElement, const int component, const int q, const_quadArray U,
-                          sharedBufferArray sharedBuffer) { sharedBuffer(q, localElement) = U(q, element, component); }
-
-    inline void writeQuads(const int elementCount, const int element, const int component, const int q, const CeedScalar Vq, quadArray V) {
-      V(q, element, component) = Vq;
-    }
-
-    inline void contractX(const int q, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, CeedScalar &V) {
-      V = 0.0;
-      for (int p = 0; p < P1D; ++p) {
-        V += B(p, q) * sharedBuffer(p, localElement);
-      }
-    }
-
-    inline void contractTransposeX(const int p, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, CeedScalar &V) {
-      V = 0.0;
-      for (int q = 0; q < Q1D; ++q) {
-        V += B(p, q) * sharedBuffer(q, localElement);
-      }
-    }
-
-    //---[ Kernels ]------------------------
-    @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * ELEMENTS_PER_BLOCK];
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) {
-          for (int q = 0; q < Q1D; ++q; @inner) {
-            const int element = elementOffset + localElement;
-            if (element < elementCount) {
-              for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                CeedScalar r;
-                if (!TRANSPOSE) {
-                  readDofs(element, localElement, component, q, U, sharedBuffer);
-                  contractX(q, localElement, sharedBuffer, B, r);
-                  writeQuads(elementCount, element, component, q, r, V);
-                } else {
-                  readQuads(elementCount, element, localElement, component, q, U, sharedBuffer);
-                  contractTransposeX(q, localElement, sharedBuffer, B, r);
-                  writeDofs(element, component, q, r, V);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * ELEMENTS_PER_BLOCK];
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) {
-          for (int q = 0; q < Q1D; ++q; @inner) {
-            const int element = elementOffset + localElement;
-            if (element < elementCount) {
-              for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                CeedScalar r;
-                if (!TRANSPOSE) {
-                  readDofs(element, localElement, component, q, U, sharedBuffer);
-                  contractX(q, localElement, sharedBuffer, Bx, r);
-                  writeQuads(elementCount, element, component, q, r, V);
-                } else {
-                  readQuads(elementCount, element, localElement, component, q, U, sharedBuffer);
-                  contractTransposeX(q, localElement, sharedBuffer, Bx, r);
-                  writeDofs(element, component, q, r, V);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        for (int element = elementOffset; element < (elementOffset + ELEMENTS_PER_BLOCK); ++element; @outer) {
-          for (int q = 0; q < Q1D; ++q; @inner) {
-            W(q, element) = qWeights1D[q];
-          }
-        }
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp
deleted file mode 100644
index 4d99490306..0000000000
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_2d_gpu_source = STRINGIFY_SOURCE(
-
-    typedef CeedScalar * dofArray @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount);
-    typedef const CeedScalar *const_dofArray @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount);
-
-    typedef CeedScalar * quadArray @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2);
-    typedef const CeedScalar *const_quadArray @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2);
-
-    typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, MAX_PQ, ELEMENTS_PER_BLOCK); typedef const CeedScalar *quadToDof @dim(P1D, Q1D);
-    typedef CeedScalar * elementWeightArray @dim(Q1D, Q1D, elementCount);
-
-    //---[ Utility Methods ]----------------
-    inline void readDofs(const int element, const int component, const int px, const int py, const_dofArray U, CeedScalar *Up) {
-      // Zero out extra entries
-      *Up = ((px < P1D) && (py < P1D) ? U(px, py, component, element) : 0.0);
-    }
-
-    inline void writeDofs(const int element, const int component, const int px, const int py, const CeedScalar Vp, dofArray V) {
-      if ((px < P1D) && (py < P1D)) {
-        V(px, py, component, element) = Vp;
-      }
-    }
-
-    inline void readQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim,
-                          const_quadArray U, CeedScalar *Uq) { *Uq = U(qx, qy, element, component, dim); }
-
-    inline void writeQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim,
-                           const CeedScalar Vq, quadArray V) { V(qx, qy, element, component, dim) = Vq; }
-
-    inline void contractX(const int qx, const int qy, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar U,
-                          CeedScalar *V) {
-      sharedBuffer(qx, qy, localElement) = U;
-      *V                                 = 0.0;
-      @barrier();
-      for (int p = 0; p < P1D; ++p) {
-        *V += B(p, qx) * sharedBuffer(p, qy, localElement);
-      }
-      @barrier();
-    }
-
-    inline void contractY(const int qx, const int qy, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar U,
-                          CeedScalar *V) {
-      sharedBuffer(qx, qy, localElement) = U;
-      *V                                 = 0.0;
-      @barrier();
-      for (int p = 0; p < P1D; ++p) {
-        *V += B(p, qy) * sharedBuffer(qx, p, localElement);
-      }
-      @barrier();
-    }
-
-    inline void contractTransposeX(const int px, const int py, const int localElement, sharedBufferArray sharedBuffer, quadToDof B,
-                                   const CeedScalar U, CeedScalar *V) {
-      sharedBuffer(px, py, localElement) = U;
-      *V                                 = 0.0;
-      @barrier();
-      for (int q = 0; q < Q1D; ++q) {
-        *V += B(px, q) * sharedBuffer(q, py, localElement);
-      }
-      @barrier();
-    }
-
-    inline void contractTransposeY(const int px, const int py, const int localElement, sharedBufferArray sharedBuffer, quadToDof B,
-                                   const CeedScalar U, CeedScalar *V) {
-      sharedBuffer(px, py, localElement) = U;
-      *V                                 = 0.0;
-      @barrier();
-      for (int q = 0; q < Q1D; ++q) {
-        *V += B(py, q) * sharedBuffer(px, q, localElement);
-      }
-      @barrier();
-    }
-
-    //---[ Kernels ]------------------------
-    @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * ELEMENTS_PER_BLOCK];
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) {
-          const int element = elementOffset + localElement;
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx; @inner) {
-              if (element < elementCount) {
-                for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                  CeedScalar r1, r2;
-                  if (!TRANSPOSE) {
-                    readDofs(element, component, qx, qy, U, &r1);
-                    contractX(qx, qy, localElement, sharedBuffer, B, r1, &r2);
-                    contractY(qx, qy, localElement, sharedBuffer, B, r2, &r1);
-                    writeQuads(elementCount, element, component, qx, qy, 0, r1, V);
-                  } else {
-                    readQuads(elementCount, element, component, qx, qy, 0, U, &r1);
-                    contractTransposeY(qx, qy, localElement, sharedBuffer, B, r1, &r2);
-                    contractTransposeX(qx, qy, localElement, sharedBuffer, B, r2, &r1);
-                    writeDofs(element, component, qx, qy, r1, V);
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * ELEMENTS_PER_BLOCK];
-
-        for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) {
-          const int element = elementOffset + localElement;
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx; @inner) {
-              if (element < elementCount) {
-                for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) {
-                  CeedScalar r1, r2, r3;
-                  if (!TRANSPOSE) {
-                    readDofs(element, component, qx, qy, U, &r1);
-                    contractX(qx, qy, localElement, sharedBuffer, Bx, r1, &r2);
-                    contractY(qx, qy, localElement, sharedBuffer, B, r2, &r3);
-                    writeQuads(elementCount, element, component, qx, qy, 0, r3, V);
-                    contractX(qx, qy, localElement, sharedBuffer, B, r1, &r2);
-                    contractY(qx, qy, localElement, sharedBuffer, Bx, r2, &r3);
-                    writeQuads(elementCount, element, component, qx, qy, 1, r3, V);
-                  } else {
-                    readQuads(elementCount, element, component, qx, qy, 0, U, &r1);
-                    contractTransposeY(qx, qy, localElement, sharedBuffer, B, r1, &r2);
-                    contractTransposeX(qx, qy, localElement, sharedBuffer, Bx, r2, &r3);
-                    readQuads(elementCount, element, component, qx, qy, 1, U, &r1);
-                    contractTransposeY(qx, qy, localElement, sharedBuffer, Bx, r1, &r2);
-                    contractTransposeX(qx, qy, localElement, sharedBuffer, B, r2, &r1);
-                    writeDofs(element, component, qx, qy, r1 + r3, V);
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) {
-      for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) {
-        for (int element = elementOffset; element < (elementOffset + ELEMENTS_PER_BLOCK); ++element; @outer) {
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx; @inner) {
-              W(qx, qy, element) = qWeights1D[qx] * qWeights1D[qy];
-            }
-          }
-        }
-      }
-    }
-
-);
diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp
deleted file mode 100644
index 3b59827a8e..0000000000
--- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../kernel-defines.hpp"
-
-const char *occa_tensor_basis_3d_gpu_source = STRINGIFY_SOURCE(
-
-    typedef CeedScalar * dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount);
-    typedef const CeedScalar *const_dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount);
-
-    typedef CeedScalar * quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3);
-    typedef const CeedScalar *const_quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3);
-
-    typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, MAX_PQ, BASIS_COMPONENT_COUNT); typedef const CeedScalar *quadToDof @dim(P1D, Q1D);
-    typedef CeedScalar * elementWeightArray @dim(Q1D, Q1D, Q1D, elementCount);
-
-    //---[ Utility Methods ]----------------
-    inline void add(const CeedScalar *U, CeedScalar *V) {
-      for (int q = 0; q < Q1D; q++) {
-        V[q] += U[q];
-      }
-    }
-
-    inline void readDofs(const int element, const int component, const int px, const int py, const_dofArray U, CeedScalar *Up) {
-      // Zero out extra entries
-      for (int pz = 0; pz < P1D; ++pz) {
-        Up[pz] = ((px < P1D) && (py < P1D) ? U(px, py, pz, component, element) : 0.0);
-      }
-      for (int q = P1D; q < Q1D; ++q) {
-        Up[q] = 0.0;
-      }
-    }
-
-    inline void writeDofs(const int element, const int component, const int px, const int py, const CeedScalar *Vp, dofArray V) {
-      if ((px < P1D) && (py < P1D)) {
-        for (int pz = 0; pz < P1D; ++pz) {
-          V(px, py, pz, component, element) = Vp[pz];
-        }
-      }
-    }
-
-    inline void readQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim,
-                          const_quadArray U, CeedScalar *Uq) {
-      for (int qz = 0; qz < Q1D; ++qz) {
-        Uq[qz] = U(qx, qy, qz, element, component, dim);
-      }
-    }
-
-    inline void writeQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim,
-                           const CeedScalar *Vq, quadArray V) {
-      for (int qz = 0; qz < Q1D; ++qz) {
-        V(qx, qy, qz, element, component, dim) = Vq[qz];
-      }
-    }
-
-    inline void contractX(const int qx, const int qy, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Uq,
-                          CeedScalar *Vp) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        sharedBuffer(qx, qy, component) = Uq[pz];
-        Vp[pz]                          = 0.0;
-        @barrier();
-        for (int p = 0; p < P1D; ++p) {
-          Vp[pz] += B(p, qx) * sharedBuffer(p, qy, component);
-        }
-        @barrier();
-      }
-    }
-
-    inline void contractY(const int qx, const int qy, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Uq,
-                          CeedScalar *Vp) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        sharedBuffer(qx, qy, component) = Uq[pz];
-        Vp[pz]                          = 0.0;
-        @barrier();
-        for (int p = 0; p < P1D; ++p) {
-          Vp[pz] += B(p, qy) * sharedBuffer(qx, p, component);
-        }
-        @barrier();
-      }
-    }
-
-    inline void contractZ(const int qx, const int qy, quadToDof B, const CeedScalar *Up, CeedScalar *Vq) {
-      for (int qz = 0; qz < Q1D; ++qz) {
-        Vq[qz] = 0.0;
-        for (int p = 0; p < P1D; ++p) {
-          Vq[qz] += B(p, qz) * Up[p];
-        }
-      }
-    }
-
-    inline void contractTransposeX(const int px, const int py, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Up,
-                                   CeedScalar *Vp) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        sharedBuffer(px, py, component) = Up[pz];
-        Vp[pz]                          = 0.0;
-        @barrier();
-        if (px < P1D) {
-          for (int qx = 0; qx < Q1D; ++qx) {
-            Vp[pz] += B(px, qx) * sharedBuffer(qx, py, component);
-          }
-        }
-        @barrier();
-      }
-    }
-
-    inline void contractTransposeY(const int px, const int py, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Up,
-                                   CeedScalar *Vp) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        sharedBuffer(px, py, component) = Up[pz];
-        Vp[pz]                          = 0.0;
-        @barrier();
-        if (py < P1D) {
-          for (int qy = 0; qy < Q1D; ++qy) {
-            Vp[pz] += B(py, qy) * sharedBuffer(px, qy, component);
-          }
-        }
-        @barrier();
-      }
-    }
-
-    inline void contractTransposeZ(const int px, const int py, quadToDof B, const CeedScalar *Uq, CeedScalar *Vq) {
-      for (int pz = 0; pz < P1D; ++pz) {
-        Vq[pz] = 0.0;
-        for (int qz = 0; qz < Q1D; ++qz) {
-          Vq[pz] += B(pz, qz) * Uq[qz];
-        }
-      }
-    }
-
-    //---[ Kernels ]------------------------
-    @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT];
-
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx; @inner) {
-              if (element < elementCount) {
-                CeedScalar r1[MAX_PQ], r2[MAX_PQ];
-                for (int q = 0; q < Q1D; ++q) {
-                  r1[q] = 0.0;
-                  r2[q] = 0.0;
-                }
-
-                if (!TRANSPOSE) {
-                  readDofs(element, component, qx, qy, U, r1);
-                  contractX(qx, qy, component, sharedBuffer, B, r1, r2);
-                  contractY(qx, qy, component, sharedBuffer, B, r2, r1);
-                  contractZ(qx, qy, B, r1, r2);
-                  writeQuads(elementCount, element, component, qx, qy, 0, r2, V);
-                } else {
-                  readQuads(elementCount, element, component, qx, qy, 0, U, r1);
-                  contractTransposeZ(qx, qy, B, r1, r2);
-                  contractTransposeY(qx, qy, component, sharedBuffer, B, r2, r1);
-                  contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r2);
-                  writeDofs(element, component, qx, qy, r2, V);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT];
-
-        for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) {
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx; @inner) {
-              if (element < elementCount) {
-                CeedScalar r1[MAX_PQ], r2[MAX_PQ], r3[MAX_PQ];
-
-                if (!TRANSPOSE) {
-                  readDofs(element, component, qx, qy, U, r1);
-                  // Dx
-                  contractX(qx, qy, component, sharedBuffer, Bx, r1, r2);
-                  contractY(qx, qy, component, sharedBuffer, B, r2, r3);
-                  contractZ(qx, qy, B, r3, r2);
-                  writeQuads(elementCount, element, component, qx, qy, 0, r2, V);
-                  // Dy
-                  contractX(qx, qy, component, sharedBuffer, B, r1, r2);
-                  contractY(qx, qy, component, sharedBuffer, Bx, r2, r3);
-                  contractZ(qx, qy, B, r3, r2);
-                  writeQuads(elementCount, element, component, qx, qy, 1, r2, V);
-                  // Dz
-                  contractX(qx, qy, component, sharedBuffer, B, r1, r2);
-                  contractY(qx, qy, component, sharedBuffer, B, r2, r3);
-                  contractZ(qx, qy, Bx, r3, r2);
-                  writeQuads(elementCount, element, component, qx, qy, 2, r2, V);
-                } else {
-                  // Dx
-                  readQuads(elementCount, element, component, qx, qy, 0, U, r1);
-                  contractTransposeZ(qx, qy, B, r1, r3);
-                  contractTransposeY(qx, qy, component, sharedBuffer, B, r3, r1);
-                  contractTransposeX(qx, qy, component, sharedBuffer, Bx, r1, r2);
-                  // Dy
-                  readQuads(elementCount, element, component, qx, qy, 1, U, r1);
-                  contractTransposeZ(qx, qy, B, r1, r3);
-                  contractTransposeY(qx, qy, component, sharedBuffer, Bx, r3, r1);
-                  contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r3);
-                  add(r3, r2);
-                  // Dz
-                  readQuads(elementCount, element, component, qx, qy, 2, U, r1);
-                  contractTransposeZ(qx, qy, Bx, r1, r3);
-                  contractTransposeY(qx, qy, component, sharedBuffer, B, r3, r1);
-                  contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r3);
-                  add(r3, r2);
-                  writeDofs(element, component, qx, qy, r2, V);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) {
-      for (int element = 0; element < elementCount; ++element; @outer) {
-        for (int qz = 0; qz < Q1D; ++qz; @inner) {
-          for (int qy = 0; qy < Q1D; ++qy; @inner) {
-            for (int qx = 0; qx < Q1D; ++qx) {
-              if (element < elementCount) {
-                W(qx, qy, qz, element) = qWeights1D[qx] * qWeights1D[qy] * qWeights1D[qz];
-              }
-            }
-          }
-        }
-      }
-    }
-
-);
diff --git a/backends/opt/ceed-opt-blocked.c b/backends/opt/ceed-opt-blocked.c
index 6b0125f2fa..e8980c3ba9 100644
--- a/backends/opt/ceed-opt-blocked.c
+++ b/backends/opt/ceed-opt-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -37,6 +37,7 @@ static int CeedInit_Opt_Blocked(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Opt));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Opt));
diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c
index 92f1e7ad07..5f072d1e25 100644
--- a/backends/opt/ceed-opt-operator.c
+++ b/backends/opt/ceed-opt-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,9 +16,9 @@
 //------------------------------------------------------------------------------
 // Setup Input/Output Fields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool is_input, const CeedInt block_size, CeedElemRestriction *block_rstr,
-                                       CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields,
-                                       CeedInt Q) {
+static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, bool *apply_add_basis,
+                                       const CeedInt block_size, CeedElemRestriction *block_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs,
+                                       CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
   Ceed                ceed;
   CeedSize            e_size, q_size;
   CeedInt             num_comp, size, P;
@@ -30,7 +30,8 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
 
     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
     CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
-    if (ceed_parent) ceed = ceed_parent;
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
   }
   if (is_input) {
     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
@@ -101,10 +102,14 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
           CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed_rstr, num_elem, elem_size, block_size, num_comp, l_size, strides,
                                                                   &block_rstr[i + start_e]));
         } break;
+        // LCOV_EXCL_START
         case CEED_RESTRICTION_POINTS:
           // Empty case - won't occur
           break;
+          // LCOV_EXCL_STOP
       }
+      CeedCallBackend(CeedDestroy(&ceed_rstr));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
       CeedCallBackend(CeedElemRestrictionCreateVector(block_rstr[i + start_e], NULL, &e_vecs_full[i + start_e]));
     }
 
@@ -124,6 +129,7 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
         CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         e_size = (CeedSize)P * num_comp * block_size;
         CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i]));
         q_size = (CeedSize)Q * size * block_size;
@@ -134,11 +140,64 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i
         q_size = (CeedSize)Q * block_size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
     }
     // Initialize E-vec arrays
     if (e_vecs[i]) CeedCallBackend(CeedVectorSetValue(e_vecs[i], 0.0));
   }
+  // Drop duplicate restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j] = true;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j]       = true;
+          apply_add_basis[i] = true;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -160,6 +219,7 @@ static int CeedOperatorSetup_Opt(CeedOperator op) {
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedGetData(ceed, &ceed_impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -172,6 +232,9 @@ static int CeedOperatorSetup_Opt(CeedOperator op) {
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->block_rstr));
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
 
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
@@ -183,11 +246,11 @@ static int CeedOperatorSetup_Opt(CeedOperator op) {
 
   // Set up infield and outfield pointer arrays
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, true, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0,
-                                              num_input_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, true, impl->skip_rstr_in, NULL, block_size, impl->block_rstr, impl->e_vecs_full,
+                                              impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, false, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out,
-                                              num_input_fields, num_output_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, false, impl->skip_rstr_out, impl->apply_add_basis_out, block_size, impl->block_rstr,
+                                              impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q));
 
   // Identity QFunctions
   if (impl->is_identity_qf) {
@@ -206,6 +269,7 @@ static int CeedOperatorSetup_Opt(CeedOperator op) {
   }
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -216,22 +280,23 @@ static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, CeedQFun
                                               CeedVector in_vec, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Opt *impl,
                                               CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    uint64_t     state;
     CeedEvalMode eval_mode;
-    CeedVector   vec;
 
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      uint64_t   state;
+      CeedVector vec;
+
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       if (vec != CEED_VECTOR_ACTIVE) {
         // Restrict
         CeedCallBackend(CeedVectorGetState(vec, &state));
-        if (state != impl->input_states[i]) {
+        if (state != impl->input_states[i] && impl->block_rstr[i] && !impl->skip_rstr_in[i]) {
           CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request));
-          impl->input_states[i] = state;
         }
+        impl->input_states[i] = state;
         // Get evec
         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data[i]));
       } else {
@@ -242,6 +307,7 @@ static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, CeedQFun
           CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_in[i], (const CeedScalar **)&e_data[i]));
         }
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -254,31 +320,33 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction
                                              CeedInt num_input_fields, CeedInt block_size, CeedVector in_vec, bool skip_active,
                                              CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Opt *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    bool                is_active_input = false;
+    bool                is_active;
     CeedInt             elem_size, size, num_comp;
     CeedEvalMode        eval_mode;
     CeedVector          vec;
     CeedElemRestriction elem_rstr;
     CeedBasis           basis;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
     // Skip active input
-    is_active_input = vec == CEED_VECTOR_ACTIVE;
-    if (skip_active && is_active_input) continue;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+    if (skip_active && is_active) continue;
 
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Restrict block active input
-    if (is_active_input) {
+    if (is_active && impl->block_rstr[i]) {
       CeedCallBackend(CeedElemRestrictionApplyBlock(impl->block_rstr[i], e / block_size, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request));
     }
     // Basis action
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        if (!is_active_input) {
+        if (!is_active) {
           CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][(CeedSize)e * Q * size]));
         }
         break;
@@ -287,11 +355,12 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        if (!is_active_input) {
+        if (!is_active) {
           CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
           CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][(CeedSize)e * elem_size * num_comp]));
         }
         CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:
         break;  // No action
@@ -304,16 +373,15 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction
 // Output Basis Action
 //------------------------------------------------------------------------------
 static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields,
-                                              CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op,
-                                              CeedVector out_vec, CeedOperator_Opt *impl, CeedRequest *request) {
+                                              CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, bool *apply_add_basis,
+                                              bool *skip_rstr, CeedOperator op, CeedVector out_vec, CeedOperator_Opt *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
-    CeedBasis           basis;
+    bool         is_active;
+    CeedEvalMode eval_mode;
+    CeedVector   vec;
+    CeedBasis    basis;
 
-    // Get elem_size, eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    // Get eval_mode
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     // Basis action
     switch (eval_mode) {
@@ -324,7 +392,12 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        if (apply_add_basis[i]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -333,12 +406,15 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio
       }
     }
     // Restrict output block
+    if (skip_rstr[i]) continue;
     // Get output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) vec = out_vec;
     // Restrict
-    CeedCallBackend(
-        CeedElemRestrictionApplyBlock(impl->block_rstr[i + impl->num_inputs], e / block_size, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request));
+    CeedCallBackend(CeedElemRestrictionApplyBlock(impl->block_rstr[i + impl->num_inputs], e / block_size, CEED_TRANSPOSE, impl->e_vecs_out[i], vec,
+                                                  request));
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -357,6 +433,7 @@ static inline int CeedOperatorRestoreInputs_Opt(CeedInt num_input_fields, CeedQF
     if (eval_mode != CEED_EVAL_WEIGHT && vec != CEED_VECTOR_ACTIVE) {
       CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_full[i], (const CeedScalar **)&e_data[i]));
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -375,20 +452,17 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Opt   *impl;
 
+  // Setup
+  CeedCallBackend(CeedOperatorSetup_Opt(op));
+
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedGetData(ceed, &ceed_impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
   const CeedInt block_size = ceed_impl->block_size;
   const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size);
 
-  // Setup
-  CeedCallBackend(CeedOperatorSetup_Opt(op));
-
   // Restriction only operator
   if (impl->is_identity_rstr_op) {
     for (CeedInt b = 0; b < num_blocks; b++) {
@@ -398,6 +472,11 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect
     return CEED_ERROR_SUCCESS;
   }
 
+  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, in_vec, e_data, impl, request));
 
@@ -416,8 +495,8 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect
   // Loop through elements
   for (CeedInt e = 0; e < num_blocks * block_size; e += block_size) {
     // Input basis apply
-    CeedCallBackend(
-        CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, in_vec, false, e_data, impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, in_vec, false, e_data, impl,
+                                               request));
 
     // Q function
     if (!impl->is_identity_qf) {
@@ -425,12 +504,13 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect
     }
 
     // Output basis apply and restriction
-    CeedCallBackend(CeedOperatorOutputBasis_Opt(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields, op,
-                                                out_vec, impl, request));
+    CeedCallBackend(CeedOperatorOutputBasis_Opt(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields,
+                                                impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec, impl, request));
   }
 
   // Restore input arrays
   CeedCallBackend(CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, e_data, impl));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -441,8 +521,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
                                                               CeedRequest *request) {
   Ceed                ceed;
   Ceed_Opt           *ceed_impl;
-  CeedSize            q_size;
-  CeedInt             Q, num_input_fields, num_output_fields, num_elem, size;
+  CeedInt             qf_size_in, qf_size_out, Q, num_input_fields, num_output_fields, num_elem;
   CeedScalar         *l_vec_array, *e_data[2 * CEED_FIELD_MAX] = {0};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -452,16 +531,17 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedGetData(ceed, &ceed_impl));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  qf_size_in  = impl->qf_size_in;
+  qf_size_out = impl->qf_size_out;
+
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  const CeedInt       block_size    = ceed_impl->block_size;
-  const CeedInt       num_blocks    = (num_elem / block_size) + !!(num_elem % block_size);
-  CeedInt             num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
+  const CeedInt       block_size = ceed_impl->block_size;
+  const CeedInt       num_blocks = (num_elem / block_size) + !!(num_elem % block_size);
   CeedVector          l_vec      = impl->qf_l_vec;
-  CeedVector         *active_in  = impl->qf_active_in;
   CeedElemRestriction block_rstr = impl->qf_block_rstr;
 
   // Setup
@@ -474,55 +554,45 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
   CeedCallBackend(CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, NULL, e_data, impl, request));
 
   // Count number of active input fields
-  if (!num_active_in) {
+  if (qf_size_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      CeedScalar *q_vec_array;
-      CeedVector  vec;
+      CeedInt    field_size;
+      CeedVector vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array));
-        CeedCallBackend(CeedRealloc(num_active_in + size, &active_in));
-        for (CeedInt field = 0; field < size; field++) {
-          q_size = (CeedSize)Q * block_size;
-          CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field]));
-          CeedCallBackend(
-              CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q * block_size]));
-        }
-        num_active_in += size;
-        CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
+        qf_size_in += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
-    impl->num_active_in = num_active_in;
-    impl->qf_active_in  = active_in;
+    CeedCheck(qf_size_in > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_in = qf_size_in;
   }
 
   // Count number of active output fields
-  if (!num_active_out) {
+  if (qf_size_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedInt    field_size;
       CeedVector vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
-        num_active_out += size;
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+        qf_size_out += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
-    impl->num_active_out = num_active_out;
+    CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_out = qf_size_out;
   }
 
-  // Check sizes
-  CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-
   // Setup l_vec
   if (!l_vec) {
-    const CeedSize l_size = (CeedSize)block_size * Q * num_active_in * num_active_out;
+    const CeedSize l_size = (CeedSize)block_size * Q * qf_size_in * qf_size_out;
 
     CeedCallBackend(CeedVectorCreate(ceed, l_size, &l_vec));
     CeedCallBackend(CeedVectorSetValue(l_vec, 0.0));
@@ -531,21 +601,21 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
 
   // Output blocked restriction
   if (!block_rstr) {
-    CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q};
+    CeedInt strides[3] = {1, Q, qf_size_in * qf_size_out * Q};
 
-    CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, num_active_in * num_active_out,
-                                                            num_active_in * num_active_out * num_elem * Q, strides, &block_rstr));
+    CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, qf_size_in * qf_size_out,
+                                                            qf_size_in * qf_size_out * num_elem * Q, strides, &block_rstr));
     impl->qf_block_rstr = block_rstr;
   }
 
   // Build objects if needed
   if (build_objects) {
-    const CeedSize l_size     = (CeedSize)num_elem * Q * num_active_in * num_active_out;
-    CeedInt        strides[3] = {1, Q, num_active_in * num_active_out * Q};
+    const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
+    CeedInt        strides[3] = {1, Q, qf_size_in * qf_size_out * Q};
 
     // Create output restriction
-    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q,
-                                                     strides, rstr));
+    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, qf_size_in * qf_size_out,
+                                                     (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled));
   }
@@ -556,55 +626,82 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
     CeedCallBackend(CeedVectorGetArray(l_vec, CEED_MEM_HOST, &l_vec_array));
 
     // Input basis apply
-    CeedCallBackend(
-        CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, NULL, true, e_data, impl, request));
+    CeedCallBackend(CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, NULL, true, e_data, impl,
+                                               request));
 
     // Assemble QFunction
-    for (CeedInt in = 0; in < num_active_in; in++) {
-      // Set Inputs
-      CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0));
-      if (num_active_in > 1) {
-        CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0));
-      }
-      if (!impl->is_identity_qf) {
-        // Set Outputs
-        for (CeedInt out = 0; out < num_output_fields; out++) {
-          CeedVector vec;
-
-          // Get output vector
-          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-          // Check if active output
-          if (vec == CEED_VECTOR_ACTIVE) {
-            CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array));
-            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
-            l_vec_array += size * Q * block_size;  // Advance the pointer by the size of the output
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool       is_active;
+      CeedInt    field_size;
+      CeedVector vec;
+
+      // Check if active input
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active) continue;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+      for (CeedInt field = 0; field < field_size; field++) {
+        // Set current portion of input to 1.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 1.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
+        }
+
+        if (!impl->is_identity_qf) {
+          // Set Outputs
+          for (CeedInt out = 0; out < num_output_fields; out++) {
+            CeedVector vec;
+
+            // Check if active output
+            CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
+            if (vec == CEED_VECTOR_ACTIVE) {
+              CeedInt field_size;
+
+              CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array));
+              CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
+              l_vec_array += field_size * Q * block_size;  // Advance the pointer by the size of the output
+            }
+            CeedCallBackend(CeedVectorDestroy(&vec));
           }
+          // Apply QFunction
+          CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out));
+        } else {
+          CeedInt           field_size;
+          const CeedScalar *array;
+
+          // Copy Identity Outputs
+          CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size));
+          CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < field_size * Q * block_size; j++) l_vec_array[j] = array[j];
+          CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array));
+          l_vec_array += field_size * Q * block_size;
+        }
+        // Reset input to 0.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q * block_size; j++) array[field * Q * block_size + j] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
         }
-        // Apply QFunction
-        CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out));
-      } else {
-        const CeedScalar *q_vec_array;
-
-        // Copy Identity Outputs
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &size));
-        CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array));
-        for (CeedInt i = 0; i < size * Q * block_size; i++) l_vec_array[i] = q_vec_array[i];
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array));
-        l_vec_array += size * Q * block_size;
       }
     }
 
-    // Assemble QFunction
+    // Un-set output Qvecs to prevent accidental overwrite of Assembled
     if (!impl->is_identity_qf) {
       for (CeedInt out = 0; out < num_output_fields; out++) {
         CeedVector vec;
 
-        // Get output vector
-        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
         // Check if active output
+        CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
         if (vec == CEED_VECTOR_ACTIVE && num_elem > 0) {
           CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL));
         }
+        CeedCallBackend(CeedVectorDestroy(&vec));
       }
     }
 
@@ -613,18 +710,20 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b
     CeedCallBackend(CeedElemRestrictionApplyBlock(block_rstr, e / block_size, CEED_TRANSPOSE, l_vec, *assembled, request));
   }
 
-  // Un-set output Qvecs to prevent accidental overwrite of Assembled
+  // Reset output Qvecs
   for (CeedInt out = 0; out < num_output_fields; out++) {
     CeedVector vec;
 
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
     // Initialize array if active output
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
     if (vec == CEED_VECTOR_ACTIVE) CeedCallBackend(CeedVectorSetValue(impl->q_vecs_out[out], 0.0));
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Restore input arrays
   CeedCallBackend(CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, e_data, impl));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -656,6 +755,9 @@ static int CeedOperatorDestroy_Opt(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->block_rstr));
   CeedCallBackend(CeedFree(&impl->e_vecs_full));
   CeedCallBackend(CeedFree(&impl->input_states));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_in));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_out));
+  CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
 
   for (CeedInt i = 0; i < impl->num_inputs; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i]));
@@ -672,10 +774,6 @@ static int CeedOperatorDestroy_Opt(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
 
   // QFunction assembly data
-  for (CeedInt i = 0; i < impl->num_active_in; i++) {
-    CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
-  }
-  CeedCallBackend(CeedFree(&impl->qf_active_in));
   CeedCallBackend(CeedVectorDestroy(&impl->qf_l_vec));
   CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_block_rstr));
 
@@ -704,6 +802,7 @@ int CeedOperatorCreate_Opt(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Opt));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Opt));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Opt));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/opt/ceed-opt-serial.c b/backends/opt/ceed-opt-serial.c
index ac506a4ec6..1e3517b44a 100644
--- a/backends/opt/ceed-opt-serial.c
+++ b/backends/opt/ceed-opt-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -37,6 +37,7 @@ static int CeedInit_Opt_Serial(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Opt));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Opt));
diff --git a/backends/opt/ceed-opt-tensor.c b/backends/opt/ceed-opt-tensor.c
index a8f5335e35..ee41dce029 100644
--- a/backends/opt/ceed-opt-tensor.c
+++ b/backends/opt/ceed-opt-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/opt/ceed-opt.h b/backends/opt/ceed-opt.h
index 9e12e612bf..a1b67a58e4 100644
--- a/backends/opt/ceed-opt.h
+++ b/backends/opt/ceed-opt.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -21,6 +21,7 @@ typedef struct {
 
 typedef struct {
   bool                 is_identity_qf, is_identity_rstr_op;
+  bool                *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
   CeedElemRestriction *block_rstr;   /* Blocked versions of restrictions */
   CeedVector          *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
   uint64_t            *input_states; /* State counter of inputs */
@@ -29,8 +30,7 @@ typedef struct {
   CeedVector          *q_vecs_in;    /* Element block input Q-vectors  */
   CeedVector          *q_vecs_out;   /* Element block output Q-vectors */
   CeedInt              num_inputs, num_outputs;
-  CeedInt              num_active_in, num_active_out;
-  CeedVector          *qf_active_in;
+  CeedInt              qf_size_in, qf_size_out;
   CeedVector           qf_l_vec;
   CeedElemRestriction  qf_block_rstr;
 } CeedOperator_Opt;
diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c
index b82e8bb278..d8eef6ce98 100644
--- a/backends/ref/ceed-ref-basis.c
+++ b/backends/ref/ceed-ref-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,17 +16,15 @@
 //------------------------------------------------------------------------------
 // Basis Apply
 //------------------------------------------------------------------------------
-static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) {
-  Ceed               ceed;
-  bool               is_tensor_basis;
+static int CeedBasisApplyCore_Ref(CeedBasis basis, bool apply_add, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U,
+                                  CeedVector V) {
+  bool               is_tensor_basis, add = apply_add || (t_mode == CEED_TRANSPOSE);
   CeedInt            dim, num_comp, q_comp, num_nodes, num_qpts;
-  const CeedInt      add = (t_mode == CEED_TRANSPOSE);
   const CeedScalar  *u;
   CeedScalar        *v;
   CeedTensorContract contract;
   CeedBasis_Ref     *impl;
 
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedBasisGetData(basis, &impl));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
@@ -35,14 +33,16 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
   CeedCallBackend(CeedBasisGetTensorContract(basis, &contract));
   if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u));
-  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
-  CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v));
-
+  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
   // Clear v if operating in transpose
-  if (t_mode == CEED_TRANSPOSE) {
-    const CeedInt v_size = num_elem * num_comp * num_nodes;
+  if (apply_add) CeedCallBackend(CeedVectorGetArray(V, CEED_MEM_HOST, &v));
+  else CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v));
+
+  if (t_mode == CEED_TRANSPOSE && !apply_add) {
+    CeedSize len;
 
-    for (CeedInt i = 0; i < v_size; i++) v[i] = (CeedScalar)0.0;
+    CeedCallBackend(CeedVectorGetLength(V, &len));
+    for (CeedInt i = 0; i < len; i++) v[i] = 0.0;
   }
 
   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor_basis));
@@ -55,7 +55,7 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
     switch (eval_mode) {
       // Interpolate to/from quadrature points
       case CEED_EVAL_INTERP: {
-        if (impl->has_collo_interp) {
+        if (impl->is_collocated) {
           memcpy(v, u, num_elem * num_comp * num_nodes * sizeof(u[0]));
         } else {
           CeedInt P = P_1d, Q = Q_1d;
@@ -101,8 +101,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
           //  or Grad to quadrature points (Transpose)
           for (CeedInt d = 0; d < dim; d++) {
             CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? interp_1d : impl->collo_grad_1d), t_mode,
-                                                    add && (d > 0),
-                                                    (t_mode == CEED_NOTRANSPOSE ? (d == 0 ? u : tmp[d % 2]) : u + d * num_qpts * num_comp * num_elem),
+                                                    (t_mode == CEED_TRANSPOSE) && (d > 0),
+                                                    (t_mode == CEED_NOTRANSPOSE ? (d == 0 ? u : tmp[d % 2]) : &u[d * num_qpts * num_comp * num_elem]),
                                                     (t_mode == CEED_NOTRANSPOSE ? (d == dim - 1 ? interp : tmp[(d + 1) % 2]) : interp)));
             pre /= P;
             post *= Q;
@@ -116,14 +116,15 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
           }
           pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem;
           for (CeedInt d = 0; d < dim; d++) {
-            CeedCallBackend(CeedTensorContractApply(
-                contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? impl->collo_grad_1d : interp_1d), t_mode, add && (d == dim - 1),
-                (t_mode == CEED_NOTRANSPOSE ? interp : (d == 0 ? interp : tmp[d % 2])),
-                (t_mode == CEED_NOTRANSPOSE ? v + d * num_qpts * num_comp * num_elem : (d == dim - 1 ? v : tmp[(d + 1) % 2]))));
+            CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? impl->collo_grad_1d : interp_1d), t_mode,
+                                                    (t_mode == CEED_NOTRANSPOSE && apply_add) || (t_mode == CEED_TRANSPOSE && (d == dim - 1)),
+                                                    (t_mode == CEED_NOTRANSPOSE ? interp : (d == 0 ? interp : tmp[d % 2])),
+                                                    (t_mode == CEED_NOTRANSPOSE ? &v[d * num_qpts * num_comp * num_elem]
+                                                                                : (d == dim - 1 ? v : tmp[(d + 1) % 2]))));
             pre /= P;
             post *= Q;
           }
-        } else if (impl->has_collo_interp) {  // Qpts collocated with nodes
+        } else if (impl->is_collocated) {  // Qpts collocated with nodes
           const CeedScalar *grad_1d;
 
           CeedCallBackend(CeedBasisGetGrad1D(basis, &grad_1d));
@@ -133,8 +134,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
 
           for (CeedInt d = 0; d < dim; d++) {
             CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, grad_1d, t_mode, add && (d > 0),
-                                                    t_mode == CEED_NOTRANSPOSE ? u : u + d * num_comp * num_qpts * num_elem,
-                                                    t_mode == CEED_TRANSPOSE ? v : v + d * num_comp * num_qpts * num_elem));
+                                                    t_mode == CEED_NOTRANSPOSE ? u : &u[d * num_comp * num_qpts * num_elem],
+                                                    t_mode == CEED_TRANSPOSE ? v : &v[d * num_comp * num_qpts * num_elem]));
             pre /= P;
             post *= Q;
           }
@@ -156,8 +157,8 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
             for (CeedInt d = 0; d < dim; d++) {
               CeedCallBackend(CeedTensorContractApply(
                   contract, pre, P, post, Q, (p == d) ? grad_1d : interp_1d, t_mode, add && (d == dim - 1),
-                  (d == 0 ? (t_mode == CEED_NOTRANSPOSE ? u : u + p * num_comp * num_qpts * num_elem) : tmp[d % 2]),
-                  (d == dim - 1 ? (t_mode == CEED_TRANSPOSE ? v : v + p * num_comp * num_qpts * num_elem) : tmp[(d + 1) % 2])));
+                  (d == 0 ? (t_mode == CEED_NOTRANSPOSE ? u : &u[p * num_comp * num_qpts * num_elem]) : tmp[d % 2]),
+                  (d == dim - 1 ? (t_mode == CEED_TRANSPOSE ? v : &v[p * num_comp * num_qpts * num_elem]) : tmp[(d + 1) % 2])));
               pre /= P;
               post *= Q;
             }
@@ -169,7 +170,7 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
         CeedInt           Q = Q_1d;
         const CeedScalar *q_weight_1d;
 
-        CeedCheck(t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
+        CeedCheck(t_mode == CEED_NOTRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
         CeedCallBackend(CeedBasisGetQWeights(basis, &q_weight_1d));
         for (CeedInt d = 0; d < dim; d++) {
           CeedInt pre = CeedIntPow(Q, dim - d - 1), post = CeedIntPow(Q, d);
@@ -188,9 +189,9 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
       // LCOV_EXCL_START
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
-        return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+        return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
       case CEED_EVAL_NONE:
-        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
+        return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
         // LCOV_EXCL_STOP
     }
   } else {
@@ -230,7 +231,7 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
       case CEED_EVAL_WEIGHT: {
         const CeedScalar *q_weight;
 
-        CeedCheck(t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
+        CeedCheck(t_mode == CEED_NOTRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
         CeedCallBackend(CeedBasisGetQWeights(basis, &q_weight));
         for (CeedInt i = 0; i < num_qpts; i++) {
           for (CeedInt e = 0; e < num_elem; e++) v[i * num_elem + e] = q_weight[i];
@@ -238,7 +239,7 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
       } break;
       // LCOV_EXCL_START
       case CEED_EVAL_NONE:
-        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
+        return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
         // LCOV_EXCL_STOP
     }
   }
@@ -249,6 +250,16 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo
   return CEED_ERROR_SUCCESS;
 }
 
+static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) {
+  CeedCallBackend(CeedBasisApplyCore_Ref(basis, false, num_elem, t_mode, eval_mode, U, V));
+  return CEED_ERROR_SUCCESS;
+}
+
+static int CeedBasisApplyAdd_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) {
+  CeedCallBackend(CeedBasisApplyCore_Ref(basis, true, num_elem, t_mode, eval_mode, U, V));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Basis Destroy Tensor
 //------------------------------------------------------------------------------
@@ -274,20 +285,9 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
   CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
 
   CeedCallBackend(CeedCalloc(1, &impl));
-  // Check for collocated interp
-  if (Q_1d == P_1d) {
-    bool has_collocated = true;
-
-    for (CeedInt i = 0; i < P_1d; i++) {
-      has_collocated = has_collocated && (fabs(interp_1d[i + P_1d * i] - 1.0) < 1e-14);
-      for (CeedInt j = 0; j < P_1d; j++) {
-        if (j != i) has_collocated = has_collocated && (fabs(interp_1d[j + P_1d * i]) < 1e-14);
-      }
-    }
-    impl->has_collo_interp = has_collocated;
-  }
   // Calculate collocated grad
-  if (Q_1d >= P_1d && !impl->has_collo_interp) {
+  CeedCallBackend(CeedBasisIsCollocated(basis, &impl->is_collocated));
+  if (Q_1d >= P_1d && !impl->is_collocated) {
     CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &impl->collo_grad_1d));
     CeedCallBackend(CeedBasisGetCollocatedGrad(basis, impl->collo_grad_1d));
   }
@@ -295,9 +295,13 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C
 
   CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract));
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
+  CeedCallBackend(CeedTensorContractDestroy(&contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyTensor_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -314,8 +318,12 @@ int CeedBasisCreateH1_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes,
 
   CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract));
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
+  CeedCallBackend(CeedTensorContractDestroy(&contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -332,8 +340,12 @@ int CeedBasisCreateHdiv_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_node
 
   CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract));
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
+  CeedCallBackend(CeedTensorContractDestroy(&contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -350,8 +362,12 @@ int CeedBasisCreateHcurl_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nod
 
   CeedCallBackend(CeedTensorContractCreate(ceed_parent, &contract));
   CeedCallBackend(CeedBasisSetTensorContract(basis, contract));
+  CeedCallBackend(CeedTensorContractDestroy(&contract));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c
index f525460ea6..326ff93e61 100644
--- a/backends/ref/ceed-ref-operator.c
+++ b/backends/ref/ceed-ref-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,8 +16,9 @@
 //------------------------------------------------------------------------------
 // Setup Input/Output Fields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs_full, CeedVector *e_vecs,
-                                       CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
+static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, CeedInt *e_data_out_indices,
+                                       bool *apply_add_basis, CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e,
+                                       CeedInt num_fields, CeedInt Q) {
   Ceed                ceed;
   CeedSize            e_size, q_size;
   CeedInt             num_comp, size, P;
@@ -29,7 +30,8 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
 
     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
     CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
-    if (ceed_parent) ceed = ceed_parent;
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
   }
   if (is_input) {
     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
@@ -49,6 +51,7 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
     if (eval_mode != CEED_EVAL_WEIGHT) {
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
       CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs_full[i + start_e]));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     }
 
     switch (eval_mode) {
@@ -69,15 +72,70 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i
         CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i]));
         q_size = (CeedSize)Q * size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:  // Only on input fields
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
         q_size = (CeedSize)Q;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         CeedCallBackend(CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
     }
   }
+  // Drop duplicate restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j] = true;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j]          = true;
+          apply_add_basis[i]    = true;
+          e_data_out_indices[j] = i;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -105,6 +163,10 @@ static int CeedOperatorSetup_Ref(CeedOperator op) {
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
 
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_data_out_indices));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
@@ -116,10 +178,11 @@ static int CeedOperatorSetup_Ref(CeedOperator op) {
 
   // Set up infield and outfield e_vecs and q_vecs
   // Infields
-  CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, true, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, true, impl->skip_rstr_in, NULL, NULL, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0,
+                                              num_input_fields, Q));
   // Outfields
-  CeedCallBackend(
-      CeedOperatorSetupFields_Ref(qf, op, false, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, false, impl->skip_rstr_out, impl->e_data_out_indices, impl->apply_add_basis_out,
+                                              impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q));
 
   // Identity QFunctions
   if (impl->is_identity_qf) {
@@ -138,6 +201,7 @@ static int CeedOperatorSetup_Ref(CeedOperator op) {
   }
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -148,14 +212,15 @@ static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, CeedQFun
                                               CeedVector in_vec, const bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX],
                                               CeedOperator_Ref *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    uint64_t            state;
-    CeedEvalMode        eval_mode;
-    CeedVector          vec;
-    CeedElemRestriction elem_rstr;
+    bool         is_active;
+    uint64_t     state;
+    CeedEvalMode eval_mode;
+    CeedVector   vec;
 
     // Get input vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) {
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
       if (skip_active) continue;
       else vec = in_vec;
     }
@@ -167,14 +232,18 @@ static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, CeedQFun
       // Restrict
       CeedCallBackend(CeedVectorGetState(vec, &state));
       // Skip restriction if input is unchanged
-      if (state != impl->input_states[i] || vec == in_vec) {
+      if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
+        CeedElemRestriction elem_rstr;
+
         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
         CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request));
-        impl->input_states[i] = state;
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
       }
+      impl->input_states[i] = state;
       // Get evec
       CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i]));
     }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -193,14 +262,18 @@ static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunction
 
     // Skip active input
     if (skip_active) {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) continue;
     }
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Basis action
@@ -216,6 +289,7 @@ static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunction
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
         CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][(CeedSize)e * elem_size * num_comp]));
         CeedCallBackend(CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:
         break;  // No action
@@ -228,7 +302,7 @@ static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunction
 // Output Basis Action
 //------------------------------------------------------------------------------
 static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields,
-                                              CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op,
+                                              CeedInt num_input_fields, CeedInt num_output_fields, bool *apply_add_basis, CeedOperator op,
                                               CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedInt             elem_size, num_comp;
@@ -239,6 +313,7 @@ static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctio
     // Get elem_size, eval_mode
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     // Basis action
     switch (eval_mode) {
@@ -252,7 +327,12 @@ static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctio
         CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
         CeedCallBackend(CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER,
                                            &e_data_full[i + num_input_fields][(CeedSize)e * elem_size * num_comp]));
-        CeedCallBackend(CeedBasisApply(basis, 1, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        if (apply_add_basis[i]) {
+          CeedCallBackend(CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        } else {
+          CeedCallBackend(CeedBasisApply(basis, 1, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -274,10 +354,13 @@ static inline int CeedOperatorRestoreInputs_Ref(CeedInt num_input_fields, CeedQF
 
     // Skip active inputs
     if (skip_active) {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) continue;
     }
     // Restore input
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
@@ -301,33 +384,40 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Ref   *impl;
 
-  CeedCallBackend(CeedOperatorGetData(op, &impl));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-
   // Setup
   CeedCallBackend(CeedOperatorSetup_Ref(op));
 
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+
   // Restriction only operator
   if (impl->is_identity_rstr_op) {
     CeedElemRestriction elem_rstr;
 
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[0], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[0], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     return CEED_ERROR_SUCCESS;
   }
 
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data_full, impl, request));
 
   // Output Evecs
-  for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields]));
+  for (CeedInt i = num_output_fields - 1; i >= 0; i--) {
+    if (impl->skip_rstr_out[i]) {
+      e_data_full[i + num_input_fields] = e_data_full[impl->e_data_out_indices[i] + num_input_fields];
+    } else {
+      CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields]));
+    }
   }
 
   // Loop through elements
@@ -337,8 +427,8 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
       if (eval_mode == CEED_EVAL_NONE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
-        CeedCallBackend(
-            CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][(CeedSize)e * Q * size]));
+        CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER,
+                                           &e_data_full[i + num_input_fields][(CeedSize)e * Q * size]));
       }
     }
 
@@ -351,28 +441,34 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
     }
 
     // Output basis apply
-    CeedCallBackend(
-        CeedOperatorOutputBasis_Ref(e, Q, qf_output_fields, op_output_fields, num_input_fields, num_output_fields, op, e_data_full, impl));
+    CeedCallBackend(CeedOperatorOutputBasis_Ref(e, Q, qf_output_fields, op_output_fields, num_input_fields, num_output_fields,
+                                                impl->apply_add_basis_out, op, e_data_full, impl));
   }
 
   // Output restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool                is_active;
     CeedVector          vec;
     CeedElemRestriction elem_rstr;
 
+    if (impl->skip_rstr_out[i]) continue;
     // Restore Evec
     CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields]));
     // Get output vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
     // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) vec = out_vec;
     // Restrict
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request));
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
 
   // Restore input arrays
   CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, false, e_data_full, impl));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -381,21 +477,18 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect
 //------------------------------------------------------------------------------
 static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
                                                               CeedRequest *request) {
-  Ceed                ceed, ceed_parent;
-  CeedSize            q_size;
-  CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
+  Ceed                ceed_parent;
+  CeedInt             qf_size_in, qf_size_out, Q, num_elem, num_input_fields, num_output_fields;
   CeedScalar         *assembled_array, *e_data_full[2 * CEED_FIELD_MAX] = {NULL};
-  CeedVector         *active_in;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Ref   *impl;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  active_in     = impl->qf_active_in;
-  num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
+  qf_size_in  = impl->qf_size_in;
+  qf_size_out = impl->qf_size_out;
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
@@ -406,65 +499,58 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
   CeedCallBackend(CeedOperatorSetup_Ref(op));
 
   // Check for restriction only operator
-  CeedCheck(!impl->is_identity_rstr_op, ceed, CEED_ERROR_BACKEND, "Assembling restriction only operators is not supported");
+  CeedCheck(!impl->is_identity_rstr_op, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Assembling restriction only operators is not supported");
 
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request));
 
   // Count number of active input fields
-  if (!num_active_in) {
+  if (qf_size_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      CeedScalar *q_vec_array;
-      CeedVector  vec;
+      CeedInt    field_size;
+      CeedVector vec;
 
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array));
-        CeedCallBackend(CeedRealloc(num_active_in + size, &active_in));
-        for (CeedInt field = 0; field < size; field++) {
-          q_size = (CeedSize)Q;
-          CeedCallBackend(CeedVectorCreate(ceed_parent, q_size, &active_in[num_active_in + field]));
-          CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q]));
-        }
-        num_active_in += size;
-        CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
+        qf_size_in += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
-    impl->num_active_in = num_active_in;
-    impl->qf_active_in  = active_in;
+    CeedCheck(qf_size_in > 0, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_in = qf_size_in;
   }
 
   // Count number of active output fields
-  if (!num_active_out) {
+  if (qf_size_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedInt    field_size;
       CeedVector vec;
 
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
       if (vec == CEED_VECTOR_ACTIVE) {
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
-        num_active_out += size;
+        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
+        qf_size_out += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
-    impl->num_active_out = num_active_out;
+    CeedCheck(qf_size_out > 0, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_out = qf_size_out;
   }
 
-  // Check sizes
-  CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-
   // Build objects if needed
   if (build_objects) {
-    const CeedSize l_size     = (CeedSize)num_elem * Q * num_active_in * num_active_out;
-    CeedInt        strides[3] = {1, Q, num_active_in * num_active_out * Q}; /* *NOPAD* */
+    const CeedSize l_size     = (CeedSize)num_elem * Q * qf_size_in * qf_size_out;
+    CeedInt        strides[3] = {1, Q, qf_size_in * qf_size_out * Q}; /* *NOPAD* */
 
     // Create output restriction
-    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
-                                                     num_active_in * num_active_out * num_elem * Q, strides, rstr));
+    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, qf_size_in * qf_size_out,
+                                                     (CeedSize)qf_size_in * (CeedSize)qf_size_out * (CeedSize)num_elem * (CeedSize)Q, strides, rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
   }
@@ -478,37 +564,66 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
     CeedCallBackend(CeedOperatorInputBasis_Ref(e, Q, qf_input_fields, op_input_fields, num_input_fields, true, e_data_full, impl));
 
     // Assemble QFunction
-    for (CeedInt in = 0; in < num_active_in; in++) {
+
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool       is_active;
+      CeedInt    field_size;
+      CeedVector vec;
+
       // Set Inputs
-      CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0));
-      if (num_active_in > 1) {
-        CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0));
-      }
-      if (!impl->is_identity_qf) {
-        // Set Outputs
-        for (CeedInt out = 0; out < num_output_fields; out++) {
-          CeedVector vec;
-
-          // Get output vector
-          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-          // Check if active output
-          if (vec == CEED_VECTOR_ACTIVE) {
-            CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array));
-            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
-            assembled_array += size * Q;  // Advance the pointer by the size of the output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active) continue;
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+      for (CeedInt field = 0; field < field_size; field++) {
+        // Set current portion of input to 1.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q; j++) array[field * Q + j] = 1.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
+        }
+
+        if (!impl->is_identity_qf) {
+          // Set Outputs
+          for (CeedInt out = 0; out < num_output_fields; out++) {
+            CeedVector vec;
+
+            // Get output vector
+            CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
+            // Check if active output
+            if (vec == CEED_VECTOR_ACTIVE) {
+              CeedInt field_size;
+
+              CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array));
+              CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
+              assembled_array += field_size * Q;  // Advance the pointer by the size of the output
+            }
+            CeedCallBackend(CeedVectorDestroy(&vec));
           }
+          // Apply QFunction
+          CeedCallBackend(CeedQFunctionApply(qf, Q, impl->q_vecs_in, impl->q_vecs_out));
+        } else {
+          CeedInt           field_size;
+          const CeedScalar *array;
+
+          // Copy Identity Outputs
+          CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size));
+          CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < field_size * Q; j++) assembled_array[j] = array[j];
+          CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array));
+          assembled_array += field_size * Q;
+        }
+        // Reset input to 0.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < Q; j++) array[field * Q + j] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
         }
-        // Apply QFunction
-        CeedCallBackend(CeedQFunctionApply(qf, Q, impl->q_vecs_in, impl->q_vecs_out));
-      } else {
-        const CeedScalar *q_vec_array;
-
-        // Copy Identity Outputs
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &size));
-        CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array));
-        for (CeedInt i = 0; i < size * Q; i++) assembled_array[i] = q_vec_array[i];
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array));
-        assembled_array += size * Q;
       }
     }
   }
@@ -524,6 +639,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
       if (vec == CEED_VECTOR_ACTIVE && num_elem > 0) {
         CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -532,6 +648,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b
 
   // Restore output
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -552,11 +670,12 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Ref(CeedOperator op, CeedVe
 //------------------------------------------------------------------------------
 // Setup Input/Output Fields
 //------------------------------------------------------------------------------
-static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs_full, CeedVector *e_vecs,
-                                               CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) {
+static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, bool *apply_add_basis,
+                                               CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields,
+                                               CeedInt Q) {
   Ceed                ceed;
   CeedSize            e_size, q_size;
-  CeedInt             e_size_padding = 0, max_num_points, num_comp, size, P;
+  CeedInt             max_num_points, num_comp, size, P;
   CeedQFunctionField *qf_fields;
   CeedOperatorField  *op_fields;
 
@@ -565,7 +684,8 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
 
     CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
     CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
-    if (ceed_parent) ceed = ceed_parent;
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
   }
   if (is_input) {
     CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL));
@@ -600,26 +720,11 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
     if (eval_mode != CEED_EVAL_WEIGHT) {
       CeedElemRestriction elem_rstr;
-      CeedSize            e_size;
-      bool                is_at_points;
 
       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
-      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
-      CeedCallBackend(CeedElemRestrictionIsPoints(elem_rstr, &is_at_points));
-      if (is_at_points) {
-        CeedCallBackend(CeedElemRestrictionGetEVectorSize(elem_rstr, &e_size));
-        if (e_size_padding == 0) {
-          CeedInt num_points, num_elem;
-
-          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
-          CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(elem_rstr, num_elem - 1, &num_points));
-          e_size_padding = (max_num_points - num_points) * num_comp;
-        }
-        CeedCallBackend(CeedVectorCreate(ceed, e_size + e_size_padding, &e_vecs_full[i + start_e]));
-        CeedCallBackend(CeedVectorSetValue(e_vecs_full[i + start_e], 0.0));
-      } else {
-        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs_full[i + start_e]));
-      }
+      CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs_full[i + start_e]));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+      CeedCallBackend(CeedVectorSetValue(e_vecs_full[i + start_e], 0.0));
     }
 
     switch (eval_mode) {
@@ -636,6 +741,7 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
           q_size = (CeedSize)max_num_points * size;
           CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         }
+        CeedCallBackend(CeedVectorDestroy(&vec));
         break;
       }
       case CEED_EVAL_INTERP:
@@ -650,19 +756,73 @@ static int CeedOperatorSetupFieldsAtPoints_Ref(CeedQFunction qf, CeedOperator op
         CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i]));
         q_size = (CeedSize)max_num_points * size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:  // Only on input fields
         CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
         q_size = (CeedSize)max_num_points;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
-        CeedCallBackend(
-            CeedBasisApplyAtPoints(basis, max_num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE, q_vecs[i]));
+        CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &max_num_points, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, CEED_VECTOR_NONE,
+                                               q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
     }
     // Initialize full arrays for E-vectors and Q-vectors
     if (e_vecs[i]) CeedCallBackend(CeedVectorSetValue(e_vecs[i], 0.0));
     if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorSetValue(q_vecs[i], 0.0));
   }
+  // Drop duplicate restrictions
+  if (is_input) {
+    for (CeedInt i = 0; i < num_fields; i++) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i + 1; j < num_fields; j++) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j] = true;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  } else {
+    for (CeedInt i = num_fields - 1; i >= 0; i--) {
+      CeedVector          vec_i;
+      CeedElemRestriction rstr_i;
+
+      CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
+      for (CeedInt j = i - 1; j >= 0; j--) {
+        CeedVector          vec_j;
+        CeedElemRestriction rstr_j;
+
+        CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
+        if (vec_i == vec_j && rstr_i == rstr_j) {
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
+          CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i + start_e], &e_vecs_full[j + start_e]));
+          skip_rstr[j]       = true;
+          apply_add_basis[i] = true;
+        }
+        CeedCallBackend(CeedVectorDestroy(&vec_j));
+        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
+      }
+      CeedCallBackend(CeedVectorDestroy(&vec_i));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
+    }
+  }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -690,6 +850,9 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) {
   // Allocate
   CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));
 
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_out));
+  CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->apply_add_basis_out));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
   CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
@@ -701,10 +864,11 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) {
 
   // Set up infield and outfield pointer arrays
   // Infields
-  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, true, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, true, impl->skip_rstr_in, NULL, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0,
+                                                      num_input_fields, Q));
   // Outfields
-  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, false, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields,
-                                                      num_output_fields, Q));
+  CeedCallBackend(CeedOperatorSetupFieldsAtPoints_Ref(qf, op, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs_full,
+                                                      impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q));
 
   // Identity QFunctions
   if (impl->is_identity_qf) {
@@ -713,6 +877,7 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) {
   }
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -721,10 +886,10 @@ static int CeedOperatorSetupAtPoints_Ref(CeedOperator op) {
 //------------------------------------------------------------------------------
 static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_points_offset, CeedInt num_points, CeedQFunctionField *qf_input_fields,
                                                      CeedOperatorField *op_input_fields, CeedInt num_input_fields, CeedVector in_vec,
-                                                     CeedVector point_coords_elem, bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
-                                                     CeedOperator_Ref *impl, CeedRequest *request) {
+                                                     CeedVector point_coords_elem, bool skip_active, bool skip_passive,
+                                                     CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    bool                is_active_input = false;
+    bool                is_active;
     CeedInt             elem_size, size, num_comp;
     CeedRestrictionType rstr_type;
     CeedEvalMode        eval_mode;
@@ -732,10 +897,12 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
     CeedElemRestriction elem_rstr;
     CeedBasis           basis;
 
-    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
     // Skip active input
-    is_active_input = vec == CEED_VECTOR_ACTIVE;
-    if (skip_active && is_active_input) continue;
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+    if (skip_active && is_active) continue;
+    if (skip_passive && !is_active) continue;
 
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
@@ -743,7 +910,8 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Restrict block active input
-    if (is_active_input) {
+    // When skipping passive inputs, we're doing assembly and should not restrict
+    if (is_active && !impl->skip_rstr_in[i] && !skip_passive) {
       if (rstr_type == CEED_RESTRICTION_POINTS) {
         CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request));
       } else {
@@ -753,7 +921,7 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
     // Basis action
     switch (eval_mode) {
       case CEED_EVAL_NONE:
-        if (!is_active_input) {
+        if (!is_active) {
           CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][num_points_offset * size]));
         }
         break;
@@ -763,17 +931,19 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        if (!is_active_input) {
+        if (!is_active) {
           CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
           CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
           CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][(CeedSize)e * elem_size * num_comp]));
         }
-        CeedCallBackend(
-            CeedBasisApplyAtPoints(basis, num_points, CEED_NOTRANSPOSE, eval_mode, point_coords_elem, impl->e_vecs_in[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_NOTRANSPOSE, eval_mode, point_coords_elem, impl->e_vecs_in[i],
+                                               impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       case CEED_EVAL_WEIGHT:
         break;  // No action
     }
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -783,15 +953,22 @@ static inline int CeedOperatorInputBasisAtPoints_Ref(CeedInt e, CeedInt num_poin
 //------------------------------------------------------------------------------
 static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_points_offset, CeedInt num_points, CeedQFunctionField *qf_output_fields,
                                                       CeedOperatorField *op_output_fields, CeedInt num_input_fields, CeedInt num_output_fields,
-                                                      CeedOperator op, CeedVector out_vec, CeedVector point_coords_elem, CeedOperator_Ref *impl,
-                                                      CeedRequest *request) {
+                                                      bool *apply_add_basis, bool *skip_rstr, CeedOperator op, CeedVector out_vec,
+                                                      CeedVector point_coords_elem, bool skip_passive, CeedOperator_Ref *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool                is_active;
     CeedRestrictionType rstr_type;
     CeedEvalMode        eval_mode;
     CeedVector          vec;
     CeedElemRestriction elem_rstr;
     CeedBasis           basis;
 
+    // Skip active input
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+    if (skip_passive && !is_active) continue;
+
     // Get elem_size, eval_mode, size
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
@@ -804,8 +981,14 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
       case CEED_EVAL_DIV:
       case CEED_EVAL_CURL:
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(
-            CeedBasisApplyAtPoints(basis, num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i], impl->e_vecs_out[i]));
+        if (apply_add_basis[i]) {
+          CeedCallBackend(CeedBasisApplyAddAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i],
+                                                    impl->e_vecs_out[i]));
+        } else {
+          CeedCallBackend(CeedBasisApplyAtPoints(basis, 1, &num_points, CEED_TRANSPOSE, eval_mode, point_coords_elem, impl->q_vecs_out[i],
+                                                 impl->e_vecs_out[i]));
+        }
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT: {
@@ -814,16 +997,24 @@ static inline int CeedOperatorOutputBasisAtPoints_Ref(CeedInt e, CeedInt num_poi
       }
     }
     // Restrict output block
+    // When skipping passive outputs, we're doing assembly and should not restrict
+    if (skip_rstr[i] || skip_passive) {
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+      continue;
+    }
+
     // Get output vector
     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
     CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
+    if (is_active) vec = out_vec;
     // Restrict
     if (rstr_type == CEED_RESTRICTION_POINTS) {
       CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request));
     } else {
       CeedCallBackend(CeedElemRestrictionApplyBlock(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request));
     }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -863,10 +1054,11 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec,
     // Setup points for element
     CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
     CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points));
+    if (num_points < 1) continue;
 
     // Input basis apply
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec,
-                                                       impl->point_coords_elem, false, e_data, impl, request));
+                                                       impl->point_coords_elem, false, false, e_data, impl, request));
 
     // Q function
     if (!impl->is_identity_qf) {
@@ -875,7 +1067,8 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec,
 
     // Output basis apply and restriction
     CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields,
-                                                        num_output_fields, op, out_vec, impl->point_coords_elem, impl, request));
+                                                        num_output_fields, impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec,
+                                                        impl->point_coords_elem, false, impl, request));
 
     num_points_offset += num_points;
   }
@@ -886,6 +1079,7 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec,
   // Cleanup point coordinates
   CeedCallBackend(CeedVectorDestroy(&point_coords));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -895,10 +1089,9 @@ static int CeedOperatorApplyAddAtPoints_Ref(CeedOperator op, CeedVector in_vec,
 static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperator op, bool build_objects, CeedVector *assembled,
                                                                       CeedElemRestriction *rstr, CeedRequest *request) {
   Ceed                ceed;
-  CeedSize            q_size;
-  CeedInt             num_active_in, num_active_out, max_num_points, num_elem, num_input_fields, num_output_fields, num_points_offset = 0;
+  CeedInt             qf_size_in, qf_size_out, max_num_points, num_elem, num_input_fields, num_output_fields, num_points_offset = 0;
   CeedScalar         *assembled_array, *e_data_full[2 * CEED_FIELD_MAX] = {NULL};
-  CeedVector         *active_in, point_coords                           = NULL;
+  CeedVector          point_coords = NULL;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_input_fields, *op_output_fields;
@@ -907,8 +1100,8 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  active_in     = impl->qf_active_in;
-  num_active_in = impl->num_active_in, num_active_out = impl->num_active_out;
+  qf_size_in  = impl->qf_size_in;
+  qf_size_out = impl->qf_size_out;
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
@@ -928,11 +1121,10 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
   CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request));
 
   // Count number of active input fields
-  if (!num_active_in) {
+  if (qf_size_in == 0) {
     for (CeedInt i = 0; i < num_input_fields; i++) {
-      CeedScalar *q_vec_array;
-      CeedInt     field_size;
-      CeedVector  vec;
+      CeedInt    field_size;
+      CeedVector vec;
 
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
@@ -944,32 +1136,25 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
           bool                is_at_points = false;
 
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
-          CeedCallBackend(CeedElemRestrictionIsPoints(elem_rstr, &is_at_points));
+          CeedCallBackend(CeedElemRestrictionIsAtPoints(elem_rstr, &is_at_points));
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
           CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction with active input at points");
         }
         // Get size of active input
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
-        CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
-        CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array));
-        CeedCallBackend(CeedRealloc(num_active_in + field_size, &active_in));
-        for (CeedInt field = 0; field < field_size; field++) {
-          q_size = (CeedSize)max_num_points;
-          CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field]));
-          CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * q_size]));
-        }
-        num_active_in += field_size;
-        CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
+        qf_size_in += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
-    impl->num_active_in = num_active_in;
-    impl->qf_active_in  = active_in;
+    CeedCheck(qf_size_in, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_in = qf_size_in;
   }
 
   // Count number of active output fields
-  if (!num_active_out) {
+  if (qf_size_out == 0) {
     for (CeedInt i = 0; i < num_output_fields; i++) {
-      CeedVector vec;
       CeedInt    field_size;
+      CeedVector vec;
 
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
@@ -981,20 +1166,21 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
           bool                is_at_points = false;
 
           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
-          CeedCallBackend(CeedElemRestrictionIsPoints(elem_rstr, &is_at_points));
+          CeedCallBackend(CeedElemRestrictionIsAtPoints(elem_rstr, &is_at_points));
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
           CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction with active input at points");
         }
         // Get size of active output
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
-        num_active_out += field_size;
+        CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
+        qf_size_out += field_size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
-    impl->num_active_out = num_active_out;
+    CeedCheck(qf_size_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+    impl->qf_size_out = qf_size_out;
   }
 
-  // Check sizes
-  CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
-
   // Build objects if needed
   if (build_objects) {
     CeedInt        num_points_total;
@@ -1004,9 +1190,8 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
 
     // Create output restriction (at points)
     CeedCallBackend(CeedElemRestrictionGetOffsets(rstr_points, CEED_MEM_HOST, &offsets));
-    CeedCallBackend(CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points_total, num_active_in * num_active_out,
-                                                      num_active_in * num_active_out * num_points_total, CEED_MEM_HOST, CEED_COPY_VALUES, offsets,
-                                                      rstr));
+    CeedCallBackend(CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points_total, qf_size_in * qf_size_out,
+                                                      qf_size_in * qf_size_out * num_points_total, CEED_MEM_HOST, CEED_COPY_VALUES, offsets, rstr));
     CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr_points, &offsets));
 
     // Create assembled vector
@@ -1023,45 +1208,73 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
     // Setup points for element
     CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
     CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points));
+    if (num_points < 1) continue;
 
     // Input basis apply
     CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, NULL,
-                                                       impl->point_coords_elem, true, e_data_full, impl, request));
+                                                       impl->point_coords_elem, true, false, e_data_full, impl, request));
 
     // Assemble QFunction
-    for (CeedInt in = 0; in < num_active_in; in++) {
-      // Set Inputs
-      CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0));
-      if (num_active_in > 1) {
-        CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0));
-      }
-      if (!impl->is_identity_qf) {
-        // Set Outputs
-        for (CeedInt out = 0; out < num_output_fields; out++) {
-          CeedVector vec;
-          CeedInt    field_size;
-
-          // Get output vector
-          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
-          // Check if active output
-          if (vec == CEED_VECTOR_ACTIVE) {
-            CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array));
-            CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
-            assembled_array += field_size * num_points;  // Advance the pointer by the size of the output
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool       is_active;
+      CeedInt    field_size;
+      CeedVector vec;
+
+      // Get input vector
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      // Check if active input
+      if (!is_active) continue;
+      // Get size of active input
+      CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &field_size));
+      for (CeedInt field = 0; field < field_size; field++) {
+        // Set current portion of input to 1.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < num_points; j++) array[field * num_points + j] = 1.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
+        }
+
+        if (!impl->is_identity_qf) {
+          // Set Outputs
+          for (CeedInt out = 0; out < num_output_fields; out++) {
+            CeedVector vec;
+            CeedInt    field_size;
+
+            // Get output vector
+            CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
+            // Check if active output
+            if (vec == CEED_VECTOR_ACTIVE) {
+              CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array));
+              CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &field_size));
+              assembled_array += field_size * num_points;  // Advance the pointer by the size of the output
+            }
+            CeedCallBackend(CeedVectorDestroy(&vec));
           }
+          // Apply QFunction
+          CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out));
+        } else {
+          const CeedScalar *array;
+          CeedInt           field_size;
+
+          // Copy Identity Outputs
+          CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size));
+          CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < field_size * num_points; j++) assembled_array[j] = array[j];
+          CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &array));
+          assembled_array += field_size * num_points;
+        }
+        // Reset input to 0.0
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &array));
+          for (CeedInt j = 0; j < num_points; j++) array[field * num_points + j] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &array));
         }
-        // Apply QFunction
-        CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out));
-      } else {
-        const CeedScalar *q_vec_array;
-        CeedInt           field_size;
-
-        // Copy Identity Outputs
-        CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[0], &field_size));
-        CeedCallBackend(CeedVectorGetArrayRead(impl->q_vecs_out[0], CEED_MEM_HOST, &q_vec_array));
-        for (CeedInt i = 0; i < field_size * num_points; i++) assembled_array[i] = q_vec_array[i];
-        CeedCallBackend(CeedVectorRestoreArrayRead(impl->q_vecs_out[0], &q_vec_array));
-        assembled_array += field_size * num_points;
       }
     }
     num_points_offset += num_points;
@@ -1078,6 +1291,7 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
       if (vec == CEED_VECTOR_ACTIVE && num_elem > 0) {
         CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -1088,8 +1302,10 @@ static inline int CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref(CeedOperat
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
 
   // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorDestroy(&point_coords));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1109,8 +1325,414 @@ static int CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref(CeedOperator op
 }
 
 //------------------------------------------------------------------------------
-// Assemble Operator
+// Assemble Operator Diagonal AtPoints
+//------------------------------------------------------------------------------
+static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref(CeedOperator op, CeedVector assembled, CeedRequest *request) {
+  CeedInt             num_points_offset = 0, num_input_fields, num_output_fields, num_elem, num_comp_active = 1;
+  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {0};
+  Ceed                ceed;
+  CeedVector          point_coords = NULL, in_vec, out_vec;
+  CeedElemRestriction rstr_points  = NULL;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Ref   *impl;
+
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetupAtPoints_Ref(op));
+
+  // Ceed
+  {
+    Ceed ceed_parent;
+
+    CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+    CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
+  }
+
+  // Point coordinates
+  CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+
+  // Input and output vectors
+  {
+    CeedSize input_size, output_size;
+
+    CeedCallBackend(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
+    CeedCallBackend(CeedVectorCreate(ceed, input_size, &in_vec));
+    CeedCallBackend(CeedVectorCreate(ceed, output_size, &out_vec));
+    CeedCallBackend(CeedVectorSetValue(out_vec, 0.0));
+  }
+
+  // Clear input Evecs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool       is_active;
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+    if (!is_active || impl->skip_rstr_in[i]) continue;
+    CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
+  }
+
+  // Input Evecs and Restriction
+  CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));
+
+  // Loop through elements
+  for (CeedInt e = 0; e < num_elem; e++) {
+    CeedInt num_points, e_vec_size = 0;
+
+    // Setup points for element
+    CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem, request));
+    CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points));
+    if (num_points < 1) continue;
+
+    // Input basis apply for non-active bases
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec,
+                                                       impl->point_coords_elem, true, false, e_data, impl, request));
+
+    // Loop over points on element
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool                is_active_at_points = true, is_active;
+      CeedInt             elem_size_active    = 1;
+      CeedRestrictionType rstr_type;
+      CeedVector          vec;
+      CeedElemRestriction elem_rstr;
+
+      // -- Skip non-active input
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active || impl->skip_rstr_in[i]) continue;
+
+      // -- Get active restriction type
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+      is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
+      if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active));
+      else elem_size_active = num_points;
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+
+      e_vec_size = elem_size_active * num_comp_active;
+      for (CeedInt s = 0; s < e_vec_size; s++) {
+        // -- Update unit vector
+        {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
+          array[s] = 1.0;
+          if (s > 0) array[s - 1] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
+        }
+        // Input basis apply for active bases
+        CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields,
+                                                           in_vec, impl->point_coords_elem, false, true, e_data, impl, request));
+
+        // -- Q function
+        if (!impl->is_identity_qf) {
+          CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out));
+        }
+
+        // -- Output basis apply and restriction
+        CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields,
+                                                            num_output_fields, impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec,
+                                                            impl->point_coords_elem, true, impl, request));
+
+        // -- Grab diagonal value
+        for (CeedInt j = 0; j < num_output_fields; j++) {
+          bool                is_active;
+          CeedInt             elem_size = 0;
+          CeedRestrictionType rstr_type;
+          CeedVector          vec;
+          CeedElemRestriction elem_rstr;
+
+          // ---- Skip non-active output
+          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
+          is_active = vec == CEED_VECTOR_ACTIVE;
+          CeedCallBackend(CeedVectorDestroy(&vec));
+          if (!is_active || impl->skip_rstr_out[j]) continue;
+
+          // ---- Check if elem size matches
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+          if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) {
+            CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+            continue;
+          }
+          if (rstr_type == CEED_RESTRICTION_POINTS) {
+            CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(elem_rstr, e, &elem_size));
+          } else {
+            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+          }
+          {
+            CeedInt num_comp = 0;
+
+            CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+            if (e_vec_size != num_comp * elem_size) {
+              CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+              continue;
+            }
+          }
+          // ---- Update output vector
+          {
+            CeedScalar *array, current_value = 0.0;
+
+            CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[j], CEED_MEM_HOST, &array));
+            current_value = array[s];
+            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[j], &array));
+            CeedCallBackend(CeedVectorSetValue(impl->e_vecs_out[j], 0.0));
+            CeedCallBackend(CeedVectorGetArray(impl->e_vecs_out[j], CEED_MEM_HOST, &array));
+            array[s] = current_value;
+            CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[j], &array));
+          }
+          // ---- Restrict output block
+          if (rstr_type == CEED_RESTRICTION_POINTS) {
+            CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[j], assembled, request));
+          } else {
+            CeedCallBackend(CeedElemRestrictionApplyBlock(elem_rstr, e, CEED_TRANSPOSE, impl->e_vecs_out[j], assembled, request));
+          }
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+        }
+        // -- Reset unit vector
+        if (s == e_vec_size - 1) {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
+          array[s] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
+        }
+      }
+    }
+    num_points_offset += num_points;
+  }
+
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+
+  // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedVectorDestroy(&in_vec));
+  CeedCallBackend(CeedVectorDestroy(&out_vec));
+  CeedCallBackend(CeedVectorDestroy(&point_coords));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Assemble Operator AtPoints
 //------------------------------------------------------------------------------
+static int CeedOperatorAssembleSingleAtPoints_Ref(CeedOperator op, CeedInt offset, CeedVector values) {
+  CeedInt             num_points_offset = 0, num_input_fields, num_output_fields, num_elem, num_comp_active = 1;
+  CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {0}, *assembled;
+  Ceed                ceed;
+  CeedVector          point_coords = NULL, in_vec, out_vec;
+  CeedElemRestriction rstr_points  = NULL;
+  CeedQFunctionField *qf_input_fields, *qf_output_fields;
+  CeedQFunction       qf;
+  CeedOperatorField  *op_input_fields, *op_output_fields;
+  CeedOperator_Ref   *impl;
+
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
+  // Setup
+  CeedCallBackend(CeedOperatorSetupAtPoints_Ref(op));
+
+  // Ceed
+  {
+    Ceed ceed_parent;
+
+    CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+    CeedCallBackend(CeedGetParent(ceed, &ceed_parent));
+    CeedCallBackend(CeedReferenceCopy(ceed_parent, &ceed));
+    CeedCallBackend(CeedDestroy(&ceed_parent));
+  }
+
+  // Point coordinates
+  CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, &point_coords));
+
+  // Input and output vectors
+  {
+    CeedSize input_size, output_size;
+
+    CeedCallBackend(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
+    CeedCallBackend(CeedVectorCreate(ceed, input_size, &in_vec));
+    CeedCallBackend(CeedVectorCreate(ceed, output_size, &out_vec));
+    CeedCallBackend(CeedVectorSetValue(out_vec, 0.0));
+  }
+
+  // Assembled array
+  CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_HOST, &assembled));
+
+  // Clear input Evecs
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    bool       is_active;
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    CeedCallBackend(CeedVectorDestroy(&vec));
+    if (!is_active || impl->skip_rstr_in[i]) continue;
+    CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
+  }
+
+  // Input Evecs and Restriction
+  CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, CEED_REQUEST_IMMEDIATE));
+
+  // Loop through elements
+  for (CeedInt e = 0; e < num_elem; e++) {
+    CeedInt num_points, e_vec_size = 0;
+
+    // Setup points for element
+    CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement(rstr_points, e, CEED_NOTRANSPOSE, point_coords, impl->point_coords_elem,
+                                                              CEED_REQUEST_IMMEDIATE));
+    CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points));
+    if (num_points < 1) continue;
+
+    // Input basis apply for non-active bases
+    CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields, in_vec,
+                                                       impl->point_coords_elem, true, false, e_data, impl, CEED_REQUEST_IMMEDIATE));
+
+    // Loop over points on element
+    for (CeedInt i = 0; i < num_input_fields; i++) {
+      bool                is_active_at_points = true, is_active;
+      CeedInt             elem_size_active    = 1;
+      CeedRestrictionType rstr_type;
+      CeedVector          vec;
+      CeedElemRestriction elem_rstr;
+
+      // -- Skip non-active input
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (!is_active || impl->skip_rstr_in[i]) continue;
+
+      // -- Get active restriction type
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+      is_active_at_points = rstr_type == CEED_RESTRICTION_POINTS;
+      if (!is_active_at_points) CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size_active));
+      else elem_size_active = num_points;
+      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp_active));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+
+      e_vec_size = elem_size_active * num_comp_active;
+      for (CeedInt s = 0; s < e_vec_size; s++) {
+        const CeedInt comp_in = s / elem_size_active;
+        const CeedInt node_in = s % elem_size_active;
+
+        // -- Update unit vector
+        {
+          CeedScalar *array;
+
+          if (s == 0) CeedCallBackend(CeedVectorSetValue(impl->e_vecs_in[i], 0.0));
+          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
+          array[s] = 1.0;
+          if (s > 0) array[s - 1] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
+        }
+        // Input basis apply for active bases
+        CeedCallBackend(CeedOperatorInputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_input_fields, op_input_fields, num_input_fields,
+                                                           in_vec, impl->point_coords_elem, false, true, e_data, impl, CEED_REQUEST_IMMEDIATE));
+
+        // -- Q function
+        if (!impl->is_identity_qf) {
+          CeedCallBackend(CeedQFunctionApply(qf, num_points, impl->q_vecs_in, impl->q_vecs_out));
+        }
+
+        // -- Output basis apply and restriction
+        CeedCallBackend(CeedOperatorOutputBasisAtPoints_Ref(e, num_points_offset, num_points, qf_output_fields, op_output_fields, num_input_fields,
+                                                            num_output_fields, impl->apply_add_basis_out, impl->skip_rstr_out, op, out_vec,
+                                                            impl->point_coords_elem, true, impl, CEED_REQUEST_IMMEDIATE));
+
+        // -- Build element matrix
+        for (CeedInt j = 0; j < num_output_fields; j++) {
+          bool                is_active;
+          CeedInt             elem_size = 0;
+          CeedRestrictionType rstr_type;
+          CeedVector          vec;
+          CeedElemRestriction elem_rstr;
+
+          // ---- Skip non-active output
+          CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec));
+          is_active = vec == CEED_VECTOR_ACTIVE;
+          CeedCallBackend(CeedVectorDestroy(&vec));
+          if (!is_active || impl->skip_rstr_out[j]) continue;
+
+          // ---- Check if elem size matches
+          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &elem_rstr));
+          CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
+          if (is_active_at_points && rstr_type != CEED_RESTRICTION_POINTS) {
+            CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+            continue;
+          }
+          if (rstr_type == CEED_RESTRICTION_POINTS) {
+            CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(elem_rstr, e, &elem_size));
+          } else {
+            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+          }
+          {
+            CeedInt num_comp = 0;
+
+            CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+            CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+            if (e_vec_size != num_comp * elem_size) continue;
+          }
+          // ---- Copy output
+          {
+            const CeedScalar *output;
+
+            CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_out[j], CEED_MEM_HOST, &output));
+            for (CeedInt k = 0; k < e_vec_size; k++) {
+              const CeedInt comp_out = k / elem_size_active;
+              const CeedInt node_out = k % elem_size_active;
+
+              assembled[offset + e * e_vec_size * e_vec_size + (comp_in * num_comp_active + comp_out) * elem_size_active * elem_size_active +
+                        node_out * elem_size_active + node_in] = output[k];
+            }
+            CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_out[j], &output));
+          }
+        }
+        // -- Reset unit vector
+        if (s == e_vec_size - 1) {
+          CeedScalar *array;
+
+          CeedCallBackend(CeedVectorGetArray(impl->e_vecs_in[i], CEED_MEM_HOST, &array));
+          array[s] = 0.0;
+          CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_in[i], &array));
+        }
+      }
+    }
+    num_points_offset += num_points;
+  }
+
+  // Restore input arrays
+  CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl));
+
+  // Restore assembled values
+  CeedCallBackend(CeedVectorRestoreArray(values, &assembled));
+
+  // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedVectorDestroy(&in_vec));
+  CeedCallBackend(CeedVectorDestroy(&out_vec));
+  CeedCallBackend(CeedVectorDestroy(&point_coords));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
+  return CEED_ERROR_SUCCESS;
+}
 
 //------------------------------------------------------------------------------
 // Operator Destroy
@@ -1119,6 +1741,10 @@ static int CeedOperatorDestroy_Ref(CeedOperator op) {
   CeedOperator_Ref *impl;
 
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_in));
+  CeedCallBackend(CeedFree(&impl->skip_rstr_out));
+  CeedCallBackend(CeedFree(&impl->e_data_out_indices));
+  CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
   for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
     CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i]));
   }
@@ -1140,12 +1766,6 @@ static int CeedOperatorDestroy_Ref(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->q_vecs_out));
   CeedCallBackend(CeedVectorDestroy(&impl->point_coords_elem));
 
-  // QFunction assembly
-  for (CeedInt i = 0; i < impl->num_active_in; i++) {
-    CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i]));
-  }
-  CeedCallBackend(CeedFree(&impl->qf_active_in));
-
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
 }
@@ -1164,6 +1784,7 @@ int CeedOperatorCreate_Ref(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1178,10 +1799,13 @@ int CeedOperatorCreateAtPoints_Ref(CeedOperator op) {
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedOperatorSetData(op, impl));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunctionAtPoints_Ref));
-  CeedCallBackend(
-      CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate",
+                                         CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingleAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAddAtPoints_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/ref/ceed-ref-qfunction.c b/backends/ref/ceed-ref-qfunction.c
index d2bbd07ad1..caedcbbad1 100644
--- a/backends/ref/ceed-ref-qfunction.c
+++ b/backends/ref/ceed-ref-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -71,6 +71,7 @@ int CeedQFunctionCreate_Ref(CeedQFunction qf) {
   CeedCallBackend(CeedQFunctionSetData(qf, impl));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/ref/ceed-ref-qfunctioncontext.c b/backends/ref/ceed-ref-qfunctioncontext.c
index 9fd2d013db..6c3e500560 100644
--- a/backends/ref/ceed-ref-qfunctioncontext.c
+++ b/backends/ref/ceed-ref-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -131,6 +131,7 @@ int CeedQFunctionContextCreate_Ref(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreData", CeedQFunctionContextRestoreData_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", CeedQFunctionContextRestoreData_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c
index 08416e4d06..de65e5854b 100644
--- a/backends/ref/ceed-ref-restriction.c
+++ b/backends/ref/ceed-ref-restriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -17,8 +17,8 @@
 // Core ElemRestriction Apply Code
 //------------------------------------------------------------------------------
 static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                      CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
-                                                                      CeedSize v_offset, const CeedScalar *__restrict__ uu,
+                                                                      const CeedInt start, const CeedInt stop, const CeedInt num_elem,
+                                                                      const CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
                                                                       CeedScalar *__restrict__ vv) {
   // No offsets provided, identity restriction
   bool has_backend_strides;
@@ -28,8 +28,8 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe
     // CPU backend strides are {1, elem_size, elem_size*num_comp}
     // This if branch is left separate to allow better inlining
     for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-      CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-        CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+      for (CeedSize k = 0; k < num_comp; k++) {
+        for (CeedSize n = 0; n < elem_size; n++) {
           CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
             vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
                 uu[n + k * elem_size + CeedIntMin(e + j, num_elem - 1) * elem_size * (CeedSize)num_comp];
@@ -43,8 +43,8 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe
 
     CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
     for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-      CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-        CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+      for (CeedSize k = 0; k < num_comp; k++) {
+        for (CeedSize n = 0; n < elem_size; n++) {
           CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
             vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
                 uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * (CeedSize)strides[2]];
@@ -57,15 +57,15 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe
 }
 
 static inline int CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                     const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                     CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                     CeedScalar *__restrict__ vv) {
+                                                                     const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                     const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                     const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Default restriction with offsets
   CeedElemRestriction_Ref *impl;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
         vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride];
       }
@@ -75,15 +75,15 @@ static inline int CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(CeedElemRes
 }
 
 static inline int CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                       const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                       CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                       CeedScalar *__restrict__ vv) {
+                                                                       const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                       const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                       const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with orientations
   CeedElemRestriction_Ref *impl;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
         vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] =
             uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0);
@@ -94,15 +94,15 @@ static inline int CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemR
 }
 
 static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                           const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                           CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                           CeedScalar *__restrict__ vv) {
+                                                                           const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                           const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                           const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with tridiagonal transformation
   CeedElemRestriction_Ref *impl;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedSize n = 0;
 
       CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
@@ -112,7 +112,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedE
             uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                 impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];
       }
-      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
+      for (n = 1; n < elem_size - 1; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
           vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
               uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
@@ -136,16 +136,16 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedE
 }
 
 static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp,
-                                                                                   const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
-                                                                                   CeedInt stop, CeedInt num_elem, CeedInt elem_size,
-                                                                                   CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                                   CeedScalar *__restrict__ vv) {
+                                                                                   const CeedInt block_size, const CeedInt comp_stride,
+                                                                                   const CeedInt start, const CeedInt stop, const CeedInt num_elem,
+                                                                                   const CeedInt elem_size, const CeedSize v_offset,
+                                                                                   const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with (unsigned) tridiagonal transformation
   CeedElemRestriction_Ref *impl;
 
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
   for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
+    for (CeedSize k = 0; k < num_comp; k++) {
       CeedSize n = 0;
 
       CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
@@ -155,7 +155,7 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Co
             uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                 abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);
       }
-      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
+      for (n = 1; n < elem_size - 1; n++) {
         CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
           vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
               uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
@@ -179,9 +179,9 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Co
 }
 
 static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                    CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
-                                                                    CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                    CeedScalar *__restrict__ vv) {
+                                                                    const CeedInt start, const CeedInt stop, const CeedInt num_elem,
+                                                                    const CeedInt elem_size, const CeedSize v_offset,
+                                                                    const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // No offsets provided, identity restriction
   bool has_backend_strides;
 
@@ -190,8 +190,8 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest
     // CPU backend strides are {1, elem_size, elem_size*num_comp}
     // This if brach is left separate to allow better inlining
     for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
-      CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-        CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+      for (CeedSize k = 0; k < num_comp; k++) {
+        for (CeedSize n = 0; n < elem_size; n++) {
           CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) {
             vv[n + k * elem_size + (e + j) * elem_size * num_comp] += uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset];
           }
@@ -204,8 +204,8 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest
 
     CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
     for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
-      CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
-        CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
+      for (CeedSize k = 0; k < num_comp; k++) {
+        for (CeedSize n = 0; n < elem_size; n++) {
           CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) {
             vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] +=
                 uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset];
@@ -218,9 +218,9 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest
 }
 
 static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                   const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                   CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                   CeedScalar *__restrict__ vv) {
+                                                                   const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                   const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                   const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Default restriction with offsets
   CeedElemRestriction_Ref *impl;
 
@@ -242,9 +242,9 @@ static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestr
 }
 
 static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                     const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                     CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                     CeedScalar *__restrict__ vv) {
+                                                                     const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                     const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                     const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with orientations
   CeedElemRestriction_Ref *impl;
 
@@ -266,9 +266,9 @@ static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRes
 }
 
 static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                                         const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
-                                                                         CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
-                                                                         CeedScalar *__restrict__ vv) {
+                                                                         const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
+                                                                         const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
+                                                                         const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with tridiagonal transformation
   CeedElemRestriction_Ref *impl;
   CeedScalar               vv_loc[block_size];
@@ -317,8 +317,9 @@ static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedEle
 }
 
 static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp,
-                                                                                 const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
-                                                                                 CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset,
+                                                                                 const CeedInt block_size, const CeedInt comp_stride,
+                                                                                 const CeedInt start, const CeedInt stop, const CeedInt num_elem,
+                                                                                 const CeedInt elem_size, const CeedSize v_offset,
                                                                                  const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
   // Restriction with (unsigned) tridiagonal transformation
   CeedElemRestriction_Ref *impl;
@@ -367,8 +368,8 @@ static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core
   return CEED_ERROR_SUCCESS;
 }
 
-static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, CeedInt start, CeedInt stop,
-                                                                     CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu,
+static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt start,
+                                                                     const CeedInt stop, CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu,
                                                                      CeedScalar *__restrict__ vv) {
   CeedInt                  num_points, l_vec_offset;
   CeedSize                 e_vec_offset = 0;
@@ -384,7 +385,7 @@ static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRes
       }
     } else {
       for (CeedSize i = 0; i < num_points; i++) {
-        for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] = uu[j * num_points + i + e_vec_offset];
+        for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] += uu[j * num_points + i + e_vec_offset];
       }
     }
     e_vec_offset += num_points * (CeedSize)num_comp;
@@ -393,8 +394,8 @@ static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRes
 }
 
 static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
-                                                    const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs,
-                                                    bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) {
+                                                    const CeedInt comp_stride, const CeedInt start, const CeedInt stop, CeedTransposeMode t_mode,
+                                                    bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) {
   CeedInt             num_elem, elem_size;
   CeedSize            v_offset = 0;
   CeedRestrictionType rstr_type;
@@ -422,8 +423,8 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co
     // Sum into for transpose mode
     switch (rstr_type) {
       case CEED_RESTRICTION_STRIDED:
-        CeedCallBackend(
-            CeedElemRestrictionApplyStridedTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
+        CeedCallBackend(CeedElemRestrictionApplyStridedTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu,
+                                                                          vv));
         break;
       case CEED_RESTRICTION_STANDARD:
         CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size,
@@ -462,8 +463,8 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co
     // Overwrite for notranspose mode
     switch (rstr_type) {
       case CEED_RESTRICTION_STRIDED:
-        CeedCallBackend(
-            CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
+        CeedCallBackend(CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset,
+                                                                            uu, vv));
         break;
       case CEED_RESTRICTION_STANDARD:
         CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size,
@@ -553,6 +554,30 @@ static int CeedElemRestrictionApply_Ref_381(CeedElemRestriction rstr, const Ceed
 }
 
 // LCOV_EXCL_START
+static int CeedElemRestrictionApply_Ref_410(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride,
+                                            CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u,
+                                            CeedVector v, CeedRequest *request) {
+  return CeedElemRestrictionApply_Ref_Core(rstr, 4, 1, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request);
+}
+
+static int CeedElemRestrictionApply_Ref_411(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride,
+                                            CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u,
+                                            CeedVector v, CeedRequest *request) {
+  return CeedElemRestrictionApply_Ref_Core(rstr, 4, 1, 1, start, stop, t_mode, use_signs, use_orients, u, v, request);
+}
+
+static int CeedElemRestrictionApply_Ref_480(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride,
+                                            CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u,
+                                            CeedVector v, CeedRequest *request) {
+  return CeedElemRestrictionApply_Ref_Core(rstr, 4, 8, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request);
+}
+
+static int CeedElemRestrictionApply_Ref_481(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride,
+                                            CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u,
+                                            CeedVector v, CeedRequest *request) {
+  return CeedElemRestrictionApply_Ref_Core(rstr, 4, 8, 1, start, stop, t_mode, use_signs, use_orients, u, v, request);
+}
+
 static int CeedElemRestrictionApply_Ref_510(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride,
                                             CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u,
                                             CeedVector v, CeedRequest *request) {
@@ -753,20 +778,32 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode,
     }
   }
 
+  // Expand E-vector size for AtPoints
+  if (rstr_type == CEED_RESTRICTION_POINTS) {
+    CeedSize max_points = 0, num_points_total = 0;
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points = offsets[i + 1] - offsets[i];
+
+      max_points = CeedIntMax(max_points, num_points);
+      num_points_total += num_points;
+    }
+    // -- Increase size for last element
+    num_points_total += (max_points - (offsets[num_elem] - offsets[num_elem - 1]));
+    CeedCallBackend(CeedElemRestrictionSetAtPointsEVectorSize(rstr, num_points_total * num_comp));
+  }
+
   // Offsets data
   if (rstr_type != CEED_RESTRICTION_STRIDED) {
     const char *resource;
 
     // Check indices for ref or memcheck backends
     {
-      Ceed current = ceed, parent = NULL;
+      Ceed current = ceed, ceed_parent = NULL;
 
-      CeedCallBackend(CeedGetParent(current, &parent));
-      while (current != parent) {
-        current = parent;
-        CeedCallBackend(CeedGetParent(current, &parent));
-      }
-      CeedCallBackend(CeedGetResource(parent, &resource));
+      CeedCallBackend(CeedGetParent(current, &ceed_parent));
+      CeedCallBackend(CeedGetResource(ceed_parent, &resource));
+      CeedCallBackend(CeedDestroy(&ceed_parent));
     }
     if (!strcmp(resource, "/cpu/self/ref/serial") || !strcmp(resource, "/cpu/self/ref/blocked")) {
       CeedSize l_size;
@@ -824,6 +861,18 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode,
       impl->Apply = CeedElemRestrictionApply_Ref_381;
       break;
     // LCOV_EXCL_START
+    case 410:
+      impl->Apply = CeedElemRestrictionApply_Ref_410;
+      break;
+    case 411:
+      impl->Apply = CeedElemRestrictionApply_Ref_411;
+      break;
+    case 480:
+      impl->Apply = CeedElemRestrictionApply_Ref_480;
+      break;
+    case 481:
+      impl->Apply = CeedElemRestrictionApply_Ref_481;
+      break;
     case 510:
       impl->Apply = CeedElemRestrictionApply_Ref_510;
       break;
@@ -856,6 +905,7 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/ref/ceed-ref-tensor.c b/backends/ref/ceed-ref-tensor.c
index a2064cfce6..9d66a2a68d 100644
--- a/backends/ref/ceed-ref-tensor.c
+++ b/backends/ref/ceed-ref-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -51,6 +51,7 @@ int CeedTensorContractCreate_Ref(CeedTensorContract contract) {
   CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));
   CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Destroy", CeedTensorContractDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/ref/ceed-ref-vector.c b/backends/ref/ceed-ref-vector.c
index f907d232c8..520afdd61a 100644
--- a/backends/ref/ceed-ref-vector.c
+++ b/backends/ref/ceed-ref-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -149,6 +149,7 @@ int CeedVectorCreate_Ref(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArray", CeedVectorRestoreArray_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArrayRead", CeedVectorRestoreArrayRead_Ref));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Ref));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedVectorSetData(vec, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/ref/ceed-ref.c b/backends/ref/ceed-ref.c
index a3c15faf8f..46af219839 100644
--- a/backends/ref/ceed-ref.c
+++ b/backends/ref/ceed-ref.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/ref/ceed-ref.h b/backends/ref/ceed-ref.h
index 8eb3b54331..4af06564a5 100644
--- a/backends/ref/ceed-ref.h
+++ b/backends/ref/ceed-ref.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -11,11 +11,6 @@
 #include <stdbool.h>
 #include <stdint.h>
 
-typedef struct {
-  CeedScalar *collo_grad_1d;
-  bool        has_collo_interp;
-} CeedBasis_Ref;
-
 typedef struct {
   CeedScalar *array;
   CeedScalar *array_borrowed;
@@ -36,6 +31,11 @@ typedef struct {
                CeedRequest *);
 } CeedElemRestriction_Ref;
 
+typedef struct {
+  CeedScalar *collo_grad_1d;
+  bool        is_collocated;
+} CeedBasis_Ref;
+
 typedef struct {
   const CeedScalar **inputs;
   CeedScalar       **outputs;
@@ -49,15 +49,17 @@ typedef struct {
 
 typedef struct {
   bool        is_identity_qf, is_identity_rstr_op;
-  CeedVector *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
+  bool       *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
+  CeedInt    *e_data_out_indices;
   uint64_t   *input_states; /* State counter of inputs */
+  CeedVector *e_vecs_full;  /* Full E-vectors, inputs followed by outputs */
   CeedVector *e_vecs_in;    /* Single element input E-vectors  */
   CeedVector *e_vecs_out;   /* Single element output E-vectors */
   CeedVector *q_vecs_in;    /* Single element input Q-vectors  */
   CeedVector *q_vecs_out;   /* Single element output Q-vectors */
   CeedInt     num_inputs, num_outputs;
-  CeedInt     num_active_in, num_active_out;
-  CeedVector *qf_active_in, point_coords_elem;
+  CeedInt     qf_size_in, qf_size_out;
+  CeedVector  point_coords_elem;
 } CeedOperator_Ref;
 
 CEED_INTERN int CeedVectorCreate_Ref(CeedSize n, CeedVector vec);
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
index ca469f8d77..b112488569 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
index a4edb6fc2b..ec783e5cc2 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -155,12 +155,12 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         // LCOV_EXCL_STOP
       }
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
   // Check output bases for Q_1d, dim as well
   //   The only input basis might be CEED_BASIS_NONE
   for (CeedInt i = 0; i < num_output_fields; i++) {
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-
     if (basis != CEED_BASIS_NONE) {
       bool is_tensor;
 
@@ -178,6 +178,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         // LCOV_EXCL_STOP
       }
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
   impl->dim  = dim;
   impl->Q_1d = Q_1d;
@@ -196,6 +197,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
         use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
         was_grad_found                = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
     for (CeedInt i = 0; i < num_output_fields; i++) {
@@ -205,6 +207,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
         use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
         was_grad_found                = true;
+        CeedCallBackend(CeedBasisDestroy(&basis));
       }
     }
   }
@@ -271,8 +274,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     // Get elem_size, eval_mode, num_comp
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
 
     // Set field constants
     if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -321,6 +325,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
       case CEED_EVAL_CURL:
         break;  // TODO: Not implemented
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
 
   code << "\n  // -- Output field constants and basis data --\n";
@@ -329,8 +334,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     // Get elem_size, eval_mode, num_comp
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
 
     // Set field constants
     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
@@ -382,6 +388,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
       }
         // LCOV_EXCL_STOP
     }
+    CeedCallBackend(CeedBasisDestroy(&basis));
   }
   code << "\n  // -- Element loop --\n";
   code << "  work_group_barrier(CLK_LOCAL_MEM_FENCE);\n";
@@ -394,8 +401,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     // Get elem_size, eval_mode, num_comp
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
 
     // Restriction
     if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) {
@@ -431,6 +438,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
              << ", num_elem, d_u_" << i << ", r_u_" << i << ");\n";
       }
     }
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
 
     // Basis action
     code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -452,12 +460,14 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
                << i << ", r_t_" << i << ", elem_scratch);\n";
         } else {
           CeedInt P_1d;
+
           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
           CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
           code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n";
           code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i
                << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i
                << ", r_t_" << i << ", elem_scratch);\n";
+          CeedCallBackend(CeedBasisDestroy(&basis));
         }
         break;
       case CEED_EVAL_WEIGHT:
@@ -466,6 +476,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
         impl->W = basis_impl->d_q_weight_1d;
         code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d(Q_1D, W, r_t_" << i << ");\n";
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;  // No action
       case CEED_EVAL_DIV:
         break;  // TODO: Not implemented
@@ -544,6 +555,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
                  << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i
                  << ", r_q_" << i << ");\n";
           }
+          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
           break;
         case CEED_EVAL_INTERP:
           code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
@@ -665,8 +677,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     // Get elem_size, eval_mode, num_comp
     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     // Basis action
     code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
     switch (eval_mode) {
@@ -690,6 +702,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
           code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i
                << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_out_" << i
                << ", r_v_" << i << ", elem_scratch);\n";
+          CeedCallBackend(CeedBasisDestroy(&basis));
         }
         break;
       // LCOV_EXCL_START
@@ -734,6 +747,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
       code << "    writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
            << ", num_elem, r_v_" << i << ", d_v_" << i << ");\n";
     }
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
 
   code << "  }\n";
@@ -766,8 +780,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
 
   // Load kernel function
   CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op));
-
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
index 100176b2d7..9370c98d5a 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -39,15 +39,6 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
   CeedOperatorField      *op_input_fields, *op_output_fields;
   CeedOperator_Sycl_gen  *impl;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedGetData(ceed, &ceed_Sycl));
-  CeedCallBackend(CeedOperatorGetData(op, &impl));
-  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
-  CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl));
-  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
-  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
-  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-
   // Check for tensor-product bases
   {
     bool has_tensor_bases;
@@ -57,13 +48,22 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
     if (!has_tensor_bases) {
       CeedOperator op_fallback;
 
-      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Falling back to sycl/ref CeedOperator due to non-tensor bases");
+      CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to sycl/ref CeedOperator due to non-tensor bases");
       CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
       CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
       return CEED_ERROR_SUCCESS;
     }
   }
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &ceed_Sycl));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
+  CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl));
+  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
+  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
+  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+
   // Creation of the operator
   CeedCallBackend(CeedOperatorBuildKernel_Sycl_gen(op));
 
@@ -73,12 +73,15 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       impl->fields->inputs[i] = NULL;
     } else {
+      bool       is_active;
       CeedVector vec;
 
       // Get input vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = input_vec;
       CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &impl->fields->inputs[i]));
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -88,11 +91,13 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
       impl->fields->outputs[i] = NULL;
     } else {
+      bool       is_active;
       CeedVector vec;
 
       // Get output vector
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = output_vec;
       output_vecs[i] = vec;
       // Check for multiple output modes
       CeedInt index = -1;
@@ -107,6 +112,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
       } else {
         impl->fields->outputs[i] = impl->fields->outputs[index];
       }
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -152,11 +158,14 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = input_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = input_vec;
       CeedCallBackend(CeedVectorRestoreArrayRead(vec, &impl->fields->inputs[i]));
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
@@ -165,10 +174,12 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) vec = output_vec;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      if (is_active) vec = output_vec;
       // Check for multiple output modes
       CeedInt index = -1;
 
@@ -181,11 +192,14 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
       if (index == -1) {
         CeedCallBackend(CeedVectorRestoreArray(vec, &impl->fields->outputs[i]));
       }
+      if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
     }
   }
 
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_impl->d_c));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -211,6 +225,7 @@ int CeedOperatorCreate_Sycl_gen(CeedOperator op) {
 
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl_gen));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
index 05774e6237..99d1438269 100644
--- a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -38,6 +38,7 @@ static int CeedQFunctionDestroy_Sycl_gen(CeedQFunction qf) {
 
   CeedCallBackend(CeedFree(&impl->qfunction_source));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -61,6 +62,7 @@ int CeedQFunctionCreate_Sycl_gen(CeedQFunction qf) {
 
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Sycl_gen));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl_gen));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-gen/ceed-sycl-gen.hpp b/backends/sycl-gen/ceed-sycl-gen.hpp
index bc1179e4f2..5ba1836197 100644
--- a/backends/sycl-gen/ceed-sycl-gen.hpp
+++ b/backends/sycl-gen/ceed-sycl-gen.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
index 3b274c8348..1d67da7e0a 100644
--- a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -18,10 +18,9 @@
 // Backend init
 //------------------------------------------------------------------------------
 static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
-  Ceed       ceed_shared;
-  Ceed_Sycl *data, *shared_data;
+  Ceed       ceed_shared, ceed_ref;
+  Ceed_Sycl *data;
   char      *resource_root;
-  const char fallback_resource[] = "/gpu/sycl/ref";
 
   CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":device_id=", &resource_root));
   CeedCheck(!strcmp(resource_root, "/gpu/sycl") || !strcmp(resource_root, "/gpu/sycl/gen"), ceed, CEED_ERROR_BACKEND,
@@ -35,12 +34,12 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
   CeedCallBackend(CeedSetStream_Sycl(ceed_shared, &(data->sycl_queue)));
+  CeedCallBackend(CeedDestroy(&ceed_shared));
 
-  CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));
-
-  Ceed ceed_fallback = NULL;
-  CeedCallBackend(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
-  CeedCallBackend(CeedSetStream_Sycl(ceed_fallback, &(data->sycl_queue)));
+  CeedCallBackend(CeedInit("/gpu/sycl/ref", &ceed_ref));
+  CeedCallBackend(CeedSetOperatorFallbackCeed(ceed, ceed_ref));
+  CeedCallBackend(CeedSetStream_Sycl(ceed_ref, &(data->sycl_queue)));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Sycl_gen));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Sycl_gen));
diff --git a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
index 54c01f0825..508830fffd 100644
--- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -312,6 +312,7 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran
       }
       break;
     case CEED_EVAL_WEIGHT:
+      CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisApplyWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
       break;
     case CEED_EVAL_NONE: /* handled separately below */
@@ -327,6 +328,7 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -487,6 +489,7 @@ static int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem,
       CeedCallBackend(CeedBasisApplyNonTensorGrad_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v));
       break;
     case CEED_EVAL_WEIGHT:
+      CeedCheck(impl->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
       CeedCallBackend(CeedBasisApplyNonTensorWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
       break;
     case CEED_EVAL_NONE: /* handled separately below */
@@ -502,7 +505,7 @@ static int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem,
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
-
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -520,11 +523,12 @@ static int CeedBasisDestroy_Sycl(CeedBasis basis) {
   // Wait for all work to finish before freeing memory
   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
 
-  CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
+  if (impl->d_q_weight_1d) CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context));
 
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -542,11 +546,12 @@ static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) {
   // Wait for all work to finish before freeing memory
   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
 
-  CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context));
+  if (impl->d_q_weight) CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_interp, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_grad, data->sycl_context));
 
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -555,11 +560,12 @@ static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) {
 //------------------------------------------------------------------------------
 int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                  const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
-  Ceed ceed;
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  Ceed            ceed;
   CeedBasis_Sycl *impl;
+  Ceed_Sycl      *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
-  Ceed_Sycl *data;
   CeedCallBackend(CeedGetData(ceed, &data));
 
   CeedInt num_comp;
@@ -581,17 +587,23 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
 
   if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
 
-  CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
-  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
+  std::vector<sycl::event> copy_events;
+  if (q_weight_1d) {
+    CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
+    sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
+    copy_events.push_back(copy_weight);
+  }
 
   const CeedInt interp_length = Q_1d * P_1d;
   CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
   sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, e);
+  copy_events.push_back(copy_interp);
 
   CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
   sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, e);
+  copy_events.push_back(copy_grad);
 
-  CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
+  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
 
   std::vector<sycl::kernel_id> kernel_ids = {sycl::get_kernel_id<CeedBasisSyclInterp<1>>(), sycl::get_kernel_id<CeedBasisSyclInterp<0>>(),
                                              sycl::get_kernel_id<CeedBasisSyclGrad<1>>(), sycl::get_kernel_id<CeedBasisSyclGrad<0>>()};
@@ -609,6 +621,7 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApply_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -617,11 +630,12 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
 //------------------------------------------------------------------------------
 int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                            const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
-  Ceed ceed;
-  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  Ceed                     ceed;
   CeedBasisNonTensor_Sycl *impl;
+  Ceed_Sycl               *data;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
-  Ceed_Sycl *data;
   CeedCallBackend(CeedGetData(ceed, &data));
 
   CeedInt num_comp;
@@ -636,24 +650,31 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
 
   if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
 
-  CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
-  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, e);
+  std::vector<sycl::event> copy_events;
+  if (q_weight) {
+    CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
+    sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, e);
+    copy_events.push_back(copy_weight);
+  }
 
   const CeedInt interp_length = num_qpts * num_nodes;
   CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
   sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, e);
+  copy_events.push_back(copy_interp);
 
   const CeedInt grad_length = num_qpts * num_nodes * dim;
   CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
   sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, e);
+  copy_events.push_back(copy_grad);
 
-  CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
+  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
 
   CeedCallBackend(CeedBasisSetData(basis, impl));
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index f3cf95641a..45cef53918 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
@@ -23,9 +23,9 @@ class CeedOperatorSyclLinearAssembleFallback;
 //------------------------------------------------------------------------------
 //  Get Basis Emode Pointer
 //------------------------------------------------------------------------------
-void CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basis_ptr, CeedEvalMode e_mode, const CeedScalar *identity, const CeedScalar *interp,
+void CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basis_ptr, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar *interp,
                                       const CeedScalar *grad) {
-  switch (e_mode) {
+  switch (eval_mode) {
     case CEED_EVAL_NONE:
       *basis_ptr = identity;
       break;
@@ -78,21 +78,24 @@ static int CeedOperatorDestroy_Sycl(CeedOperator op) {
 
   // Diag data
   if (impl->diag) {
-    CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in));
-    CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out));
+    CeedCallBackend(CeedFree(&impl->diag->h_eval_mode_in));
+    CeedCallBackend(CeedFree(&impl->diag->h_eval_mode_out));
 
     CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw());
-    CeedCallSycl(ceed, sycl::free(impl->diag->d_e_mode_in, sycl_data->sycl_context));
-    CeedCallSycl(ceed, sycl::free(impl->diag->d_e_mode_out, sycl_data->sycl_context));
+    CeedCallSycl(ceed, sycl::free(impl->diag->d_eval_mode_in, sycl_data->sycl_context));
+    CeedCallSycl(ceed, sycl::free(impl->diag->d_eval_mode_out, sycl_data->sycl_context));
     CeedCallSycl(ceed, sycl::free(impl->diag->d_identity, sycl_data->sycl_context));
     CeedCallSycl(ceed, sycl::free(impl->diag->d_interp_in, sycl_data->sycl_context));
     CeedCallSycl(ceed, sycl::free(impl->diag->d_interp_out, sycl_data->sycl_context));
     CeedCallSycl(ceed, sycl::free(impl->diag->d_grad_in, sycl_data->sycl_context));
     CeedCallSycl(ceed, sycl::free(impl->diag->d_grad_out, sycl_data->sycl_context));
-    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
 
     CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag));
     CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr));
+    CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr));
+    CeedCallBackend(CeedBasisDestroy(&impl->diag->basis_in));
+    CeedCallBackend(CeedBasisDestroy(&impl->diag->basis_out));
   }
   CeedCallBackend(CeedFree(&impl->diag));
 
@@ -104,6 +107,7 @@ static int CeedOperatorDestroy_Sycl(CeedOperator op) {
   CeedCallBackend(CeedFree(&impl->asmb));
 
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -115,7 +119,7 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
   Ceed                ceed;
   CeedSize            q_size;
   bool                is_strided, skip_restriction;
-  CeedInt             dim, size;
+  CeedInt             size;
   CeedOperatorField  *op_fields;
   CeedQFunctionField *qf_fields;
 
@@ -130,46 +134,47 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
 
   // Loop over fields
   for (CeedInt i = 0; i < num_fields; i++) {
-    CeedEvalMode        e_mode;
+    CeedEvalMode        eval_mode;
     CeedVector          vec;
-    CeedElemRestriction rstr;
-    CeedBasis           basis;
+    CeedElemRestriction elem_rstr;
 
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode));
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
 
     is_strided       = false;
     skip_restriction = false;
-    if (e_mode != CEED_EVAL_WEIGHT) {
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
+    if (eval_mode != CEED_EVAL_WEIGHT) {
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
 
       // Check whether this field can skip the element restriction:
-      // must be passive input, with  e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
+      // must be passive input, with  eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND.
 
       // First, check whether the field is input or output:
       if (is_input) {
         // Check for passive input:
         CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
         if (vec != CEED_VECTOR_ACTIVE) {
-          // Check  e_mode
-          if (e_mode == CEED_EVAL_NONE) {
+          // Check  eval_mode
+          if (eval_mode == CEED_EVAL_NONE) {
             // Check for  is_strided restriction
-            CeedCallBackend(CeedElemRestrictionIsStrided(rstr, &is_strided));
+            CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
             if (is_strided) {
               // Check if vector is already in preferred backend ordering
-              CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &skip_restriction));
+              CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction));
             }
           }
         }
+        CeedCallBackend(CeedVectorDestroy(&vec));
       }
       if (skip_restriction) {
         // We do not need an E-Vector, but will use the input field vector's data directly in the operator application
         e_vecs[i + start_e] = NULL;
       } else {
-        CeedCallBackend(CeedElemRestrictionCreateVector(rstr, NULL, &e_vecs[i + start_e]));
+        CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e]));
       }
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     }
 
-    switch (e_mode) {
+    switch (eval_mode) {
       case CEED_EVAL_NONE:
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
         q_size = (CeedSize)num_elem * Q * size;
@@ -181,24 +186,28 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         break;
       case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size));
-        CeedCallBackend(CeedBasisGetDimension(basis, &dim));
         q_size = (CeedSize)num_elem * Q * size;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
         break;
-      case CEED_EVAL_WEIGHT:  // Only on input fields
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      case CEED_EVAL_WEIGHT: {
+        CeedBasis basis;
+
+        // Note: only on input fields
         q_size = (CeedSize)num_elem * Q;
         CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i]));
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
         CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       case CEED_EVAL_DIV:
         break;  // TODO: Not implemented
       case CEED_EVAL_CURL:
         break;  // TODO: Not implemented
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -207,7 +216,6 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool
 // passive) to the named inputs and outputs of its CeedQFunction.
 //------------------------------------------------------------------------------
 static int CeedOperatorSetup_Sycl(CeedOperator op) {
-  Ceed                ceed;
   bool                is_setup_done;
   CeedInt             Q, num_elem, num_input_fields, num_output_fields;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
@@ -218,7 +226,6 @@ static int CeedOperatorSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
   if (is_setup_done) return CEED_ERROR_SUCCESS;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
@@ -242,6 +249,7 @@ static int CeedOperatorSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
 
   CeedCallBackend(CeedOperatorSetSetupDone(op));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -252,35 +260,35 @@ static inline int CeedOperatorSetupInputs_Sycl(CeedInt num_input_fields, CeedQFu
                                                CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
                                                CeedOperator_Sycl *impl, CeedRequest *request) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode        e_mode;
-    CeedVector          vec;
-    CeedElemRestriction rstr;
+    bool         is_active;
+    CeedEvalMode eval_mode;
+    CeedVector   vec;
 
     // Get input vector
     CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-    if (vec == CEED_VECTOR_ACTIVE) {
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) {
       if (skip_active) continue;
       else vec = in_vec;
     }
 
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode));
-    if (e_mode == CEED_EVAL_WEIGHT) {  // Skip
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      // Get input element restriction
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr));
-      if (vec == CEED_VECTOR_ACTIVE) vec = in_vec;
       // Restrict, if necessary
       if (!impl->e_vecs[i]) {
         // No restriction for this field; read data directly from vec.
         CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       } else {
-        CeedCallBackend(CeedElemRestrictionApply(rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
-        // Get evec
+        CeedElemRestriction elem_rstr;
+
+        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
+        CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
+        CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
         CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
       }
     }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -292,36 +300,33 @@ static inline int CeedOperatorInputBasis_Sycl(CeedInt num_elem, CeedQFunctionFie
                                               CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX],
                                               CeedOperator_Sycl *impl) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedInt             elem_size, size;
-    CeedElemRestriction rstr;
-    CeedEvalMode        e_mode;
-    CeedBasis           basis;
+    CeedEvalMode eval_mode;
 
     // Skip active input
     if (skip_active) {
+      bool       is_active;
       CeedVector vec;
 
       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      is_active = vec == CEED_VECTOR_ACTIVE;
+      CeedCallBackend(CeedVectorDestroy(&vec));
+      if (is_active) continue;
     }
-    // Get elem_size,  e_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode));
-    CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
     // Basis action
-    switch (e_mode) {
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    switch (eval_mode) {
       case CEED_EVAL_NONE:
         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i]));
         break;
       case CEED_EVAL_INTERP:
+      case CEED_EVAL_GRAD: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i]));
-        break;
-      case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       case CEED_EVAL_WEIGHT:
         break;  // No action
       case CEED_EVAL_DIV:
@@ -339,24 +344,26 @@ static inline int CeedOperatorInputBasis_Sycl(CeedInt num_elem, CeedQFunctionFie
 static inline int CeedOperatorRestoreInputs_Sycl(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields,
                                                  const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl) {
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode e_mode;
+    bool         is_active;
+    CeedEvalMode eval_mode;
     CeedVector   vec;
 
+    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
     // Skip active input
     if (skip_active) {
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
-      if (vec == CEED_VECTOR_ACTIVE) continue;
+      if (is_active) continue;
     }
-    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode));
-    if (e_mode == CEED_EVAL_WEIGHT) {  // Skip
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+    if (eval_mode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
       if (!impl->e_vecs[i]) {  // This was a  skip_restriction case
-        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
         CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i]));
       } else {
         CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i]));
       }
     }
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -404,12 +411,12 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
 
   // Output basis apply if needed
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedElemRestriction rstr;
-    CeedBasis           basis;
+    CeedElemRestriction elem_rstr;
 
-    // Get elem_size,  eval_mode, size
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr));
-    CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
+    // Get elem_size, eval_mode, size
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
     // Basis action
@@ -417,63 +424,60 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVec
       case CEED_EVAL_NONE:
         break;
       case CEED_EVAL_INTERP:
+      case CEED_EVAL_GRAD: {
+        CeedBasis basis;
+
         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in]));
-        break;
-      case CEED_EVAL_GRAD:
-        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in]));
+        CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in]));
+        CeedCallBackend(CeedBasisDestroy(&basis));
         break;
+      }
       // LCOV_EXCL_START
       case CEED_EVAL_WEIGHT:
-        Ceed ceed;
-
-        CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
+        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
         break;  // Should not occur
       case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        Ceed ceed;
-
-        CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-        return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
+      case CEED_EVAL_CURL:
+        return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
         break;  // Should not occur
-      }
-        // LCOV_EXCL_STOP
+                // LCOV_EXCL_STOP
     }
   }
 
   // Output restriction
   for (CeedInt i = 0; i < num_output_fields; i++) {
+    bool                is_active;
+    CeedEvalMode        eval_mode;
     CeedVector          vec;
-    CeedElemRestriction rstr;
+    CeedElemRestriction elem_rstr;
 
     // Restore evec
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
     if (eval_mode == CEED_EVAL_NONE) {
       CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_e_in], &e_data[i + num_input_fields]));
     }
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
     // Restrict
-    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr));
-    // Active
-    if (vec == CEED_VECTOR_ACTIVE) vec = out_vec;
-
-    CeedCallBackend(CeedElemRestrictionApply(rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_e_in], vec, request));
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
+    is_active = vec == CEED_VECTOR_ACTIVE;
+    if (is_active) vec = out_vec;
+    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
+    CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_e_in], vec, request));
+    if (!is_active) CeedCallBackend(CeedVectorDestroy(&vec));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
   }
 
   // Restore input arrays
   CeedCallBackend(CeedOperatorRestoreInputs_Sycl(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Core code for assembling linear QFunction
 //------------------------------------------------------------------------------
-static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr,
-                                                               CeedRequest *request) {
-  Ceed                ceed, ceed_parent;
+static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, bool build_objects, CeedVector *assembled,
+                                                               CeedElemRestriction *elem_rstr, CeedRequest *request) {
+  Ceed                ceed_parent;
   CeedSize            q_size;
   CeedInt             num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size;
   CeedScalar         *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL};
@@ -483,7 +487,6 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
   CeedOperatorField  *op_input_fields, *op_output_fields;
   CeedOperator_Sycl  *impl;
 
-  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
@@ -506,9 +509,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
       CeedScalar *q_vec_array;
       CeedVector  vec;
 
-      // Get input vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       // Check if active input
+      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size));
         CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0));
@@ -516,13 +518,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
         CeedCallBackend(CeedRealloc(num_active_in + size, &active_in));
         for (CeedInt field = 0; field < size; field++) {
           q_size = (CeedSize)Q * num_elem;
-          CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field]));
-          CeedCallBackend(
-              CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem]));
+          CeedCallBackend(CeedVectorCreate(ceed_parent, q_size, &active_in[num_active_in + field]));
+          CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER,
+                                             &q_vec_array[field * Q * num_elem]));
         }
         num_active_in += size;
         CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array));
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     impl->num_active_in = num_active_in;
     impl->qf_active_in  = active_in;
@@ -533,19 +536,20 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
     for (CeedInt i = 0; i < num_output_fields; i++) {
       CeedVector vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size));
         num_active_out += size;
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     impl->num_active_out = num_active_out;
   }
 
   // Check sizes
-  CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs");
+  CeedCheck(num_active_in > 0 && num_active_out > 0, CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND,
+            "Cannot assemble QFunction without active inputs and outputs");
 
   // Build objects if needed
   if (build_objects) {
@@ -553,8 +557,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
     CeedInt  strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */
 
     // Create output restriction
-    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out,
-                                                     num_active_in * num_active_out * num_elem * Q, strides, rstr));
+    CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, l_size, strides, elem_rstr));
     // Create assembled vector
     CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled));
   }
@@ -575,14 +578,14 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
     for (CeedInt out = 0; out < num_output_fields; out++) {
       CeedVector vec;
 
-      // Get output vector
-      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
       // Check if active output
+      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
       if (vec == CEED_VECTOR_ACTIVE) {
         CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array));
         CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size));
         assembled_array += size * Q * num_elem;  // Advance the pointer by the size of the output
       }
+      CeedCallBackend(CeedVectorDestroy(&vec));
     }
     // Apply QFunction
     CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
@@ -592,12 +595,12 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
   for (CeedInt out = 0; out < num_output_fields; out++) {
     CeedVector vec;
 
-    // Get output vector
-    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
     // Check if active output
+    CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
       CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL));
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Restore input arrays
@@ -605,21 +608,24 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
 
   // Restore output
   CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array));
+  CeedCallBackend(CeedDestroy(&ceed_parent));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Assemble Linear QFunction
 //------------------------------------------------------------------------------
-static int CeedOperatorLinearAssembleQFunction_Sycl(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
-  return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, true, assembled, rstr, request);
+static int CeedOperatorLinearAssembleQFunction_Sycl(CeedOperator op, CeedVector *assembled, CeedElemRestriction *elem_rstr, CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, true, assembled, elem_rstr, request);
 }
 
 //------------------------------------------------------------------------------
 // Update Assembled Linear QFunction
 //------------------------------------------------------------------------------
-static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) {
-  return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, false, &assembled, &rstr, request);
+static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedVector assembled, CeedElemRestriction elem_rstr,
+                                                          CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionCore_Sycl(op, false, &assembled, &elem_rstr, request);
 }
 
 //------------------------------------------------------------------------------
@@ -628,10 +634,10 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedV
 static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   Ceed                ceed;
   Ceed_Sycl          *sycl_data;
-  CeedInt             num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0;
-  CeedEvalMode       *e_mode_in = NULL, *e_mode_out = NULL;
-  CeedBasis           basis_in = NULL, basis_out = NULL;
+  CeedInt             num_input_fields, num_output_fields, num_eval_mode_in = 0, num_comp = 0, dim = 1, num_eval_mode_out = 0;
+  CeedEvalMode       *eval_mode_in = NULL, *eval_mode_out = NULL;
   CeedElemRestriction rstr_in = NULL, rstr_out = NULL;
+  CeedBasis           basis_in = NULL, basis_out = NULL;
   CeedQFunctionField *qf_fields;
   CeedQFunction       qf;
   CeedOperatorField  *op_fields;
@@ -649,28 +655,31 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedEvalMode        e_mode;
-      CeedElemRestriction rstr;
-
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in));
-      CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
+      if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in));
+      CeedCheck(rstr_in == elem_rstr, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly");
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCheck(basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator diagonal assembly with multiple active bases");
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedBasisGetDimension(basis_in, &dim));
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
-      CeedCheck(!rstr_in || rstr_in == rstr, ceed, CEED_ERROR_BACKEND,
-                "Backend does not implement multi-field non-composite operator diagonal assembly");
-      rstr_in = rstr;
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode));
-      switch (e_mode) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
+      switch (eval_mode) {
         case CEED_EVAL_NONE:
         case CEED_EVAL_INTERP:
-          CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in));
-          e_mode_in[num_e_mode_in] = e_mode;
-          num_e_mode_in += 1;
+          CeedCallBackend(CeedRealloc(num_eval_mode_in + 1, &eval_mode_in));
+          eval_mode_in[num_eval_mode_in] = eval_mode;
+          num_eval_mode_in += 1;
           break;
         case CEED_EVAL_GRAD:
-          CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in));
-          for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode;
-          num_e_mode_in += dim;
+          CeedCallBackend(CeedRealloc(num_eval_mode_in + dim, &eval_mode_in));
+          for (CeedInt d = 0; d < dim; d++) eval_mode_in[num_eval_mode_in + d] = eval_mode;
+          num_eval_mode_in += dim;
           break;
         case CEED_EVAL_WEIGHT:
         case CEED_EVAL_DIV:
@@ -678,7 +687,9 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
           break;  // Caught by QF Assembly
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
 
   // Determine active output basis
   CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields));
@@ -688,26 +699,30 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
 
     CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedEvalMode        e_mode;
-      CeedElemRestriction rstr;
-
-      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out));
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
-      CeedCheck(!rstr_out || rstr_out == rstr, ceed, CEED_ERROR_BACKEND,
-                "Backend does not implement multi-field non-composite operator diagonal assembly");
-      rstr_out = rstr;
-      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode));
-      switch (e_mode) {
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr));
+      if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out));
+      CeedCheck(rstr_out == elem_rstr, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly");
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+      CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCheck(basis_out == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator diagonal assembly with multiple active bases");
+      CeedCallBackend(CeedBasisDestroy(&basis));
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
+      switch (eval_mode) {
         case CEED_EVAL_NONE:
         case CEED_EVAL_INTERP:
-          CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out));
-          e_mode_out[num_e_mode_out] = e_mode;
-          num_e_mode_out += 1;
+          CeedCallBackend(CeedRealloc(num_eval_mode_out + 1, &eval_mode_out));
+          eval_mode_out[num_eval_mode_out] = eval_mode;
+          num_eval_mode_out += 1;
           break;
         case CEED_EVAL_GRAD:
-          CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out));
-          for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode;
-          num_e_mode_out += dim;
+          CeedCallBackend(CeedRealloc(num_eval_mode_out + dim, &eval_mode_out));
+          for (CeedInt d = 0; d < dim; d++) eval_mode_out[num_eval_mode_out + d] = eval_mode;
+          num_eval_mode_out += dim;
           break;
         case CEED_EVAL_WEIGHT:
         case CEED_EVAL_DIV:
@@ -715,6 +730,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
           break;  // Caught by QF Assembly
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Operator data struct
@@ -723,17 +739,18 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedCalloc(1, &impl->diag));
   CeedOperatorDiag_Sycl *diag = impl->diag;
 
-  diag->basis_in       = basis_in;
-  diag->basis_out      = basis_out;
-  diag->h_e_mode_in    = e_mode_in;
-  diag->h_e_mode_out   = e_mode_out;
-  diag->num_e_mode_in  = num_e_mode_in;
-  diag->num_e_mode_out = num_e_mode_out;
+  CeedCallBackend(CeedBasisReferenceCopy(basis_in, &diag->basis_in));
+  CeedCallBackend(CeedBasisReferenceCopy(basis_out, &diag->basis_out));
+  diag->h_eval_mode_in    = eval_mode_in;
+  diag->h_eval_mode_out   = eval_mode_out;
+  diag->num_eval_mode_in  = num_eval_mode_in;
+  diag->num_eval_mode_out = num_eval_mode_out;
 
   // Kernel parameters
   CeedInt num_nodes, num_qpts;
   CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
+  CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
   diag->num_nodes = num_nodes;
   diag->num_qpts  = num_qpts;
   diag->num_comp  = num_comp;
@@ -746,8 +763,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   // CEED_EVAL_NONE
   CeedScalar *identity      = NULL;
   bool        has_eval_none = false;
-  for (CeedInt i = 0; i < num_e_mode_in; i++) has_eval_none = has_eval_none || (e_mode_in[i] == CEED_EVAL_NONE);
-  for (CeedInt i = 0; i < num_e_mode_out; i++) has_eval_none = has_eval_none || (e_mode_out[i] == CEED_EVAL_NONE);
+  for (CeedInt i = 0; i < num_eval_mode_in; i++) has_eval_none = has_eval_none || (eval_mode_in[i] == CEED_EVAL_NONE);
+  for (CeedInt i = 0; i < num_eval_mode_out; i++) has_eval_none = has_eval_none || (eval_mode_out[i] == CEED_EVAL_NONE);
 
   std::vector<sycl::event> e;
 
@@ -785,20 +802,27 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
   sycl::event grad_out_copy = sycl_data->sycl_queue.copy<CeedScalar>(grad_out, diag->d_grad_out, g_len, e);
   copy_events.push_back(grad_out_copy);
 
-  // Arrays of  e_modes
-  CeedCallSycl(ceed, diag->d_e_mode_in = sycl::malloc_device<CeedEvalMode>(num_e_mode_in, sycl_data->sycl_device, sycl_data->sycl_context));
-  sycl::event e_mode_in_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(e_mode_in, diag->d_e_mode_in, num_e_mode_in, e);
-  copy_events.push_back(e_mode_in_copy);
+  // Arrays of  eval_modes
+  CeedCallSycl(ceed, diag->d_eval_mode_in = sycl::malloc_device<CeedEvalMode>(num_eval_mode_in, sycl_data->sycl_device, sycl_data->sycl_context));
+  sycl::event eval_mode_in_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(eval_mode_in, diag->d_eval_mode_in, num_eval_mode_in, e);
+  copy_events.push_back(eval_mode_in_copy);
 
-  CeedCallSycl(ceed, diag->d_e_mode_out = sycl::malloc_device<CeedEvalMode>(num_e_mode_out, sycl_data->sycl_device, sycl_data->sycl_context));
-  sycl::event e_mode_out_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(e_mode_out, diag->d_e_mode_out, num_e_mode_out, e);
-  copy_events.push_back(e_mode_out_copy);
+  CeedCallSycl(ceed, diag->d_eval_mode_out = sycl::malloc_device<CeedEvalMode>(num_eval_mode_out, sycl_data->sycl_device, sycl_data->sycl_context));
+  sycl::event eval_mode_out_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(eval_mode_out, diag->d_eval_mode_out, num_eval_mode_out, e);
+  copy_events.push_back(eval_mode_out_copy);
 
   // Restriction
-  diag->diag_rstr = rstr_out;
+  CeedCallBackend(CeedElemRestrictionReferenceCopy(rstr_out, &diag->diag_rstr));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
 
   // Wait for all copies to complete and handle exceptions
   CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
+
+  // Cleanup
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -807,18 +831,18 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool is_point_block, const CeedInt num_elem,
                                            const CeedOperatorDiag_Sycl *diag, const CeedScalar *assembled_qf_array, CeedScalar *elem_diag_array) {
-  const CeedSize      num_nodes      = diag->num_nodes;
-  const CeedSize      num_qpts       = diag->num_qpts;
-  const CeedSize      num_comp       = diag->num_comp;
-  const CeedSize      num_e_mode_in  = diag->num_e_mode_in;
-  const CeedSize      num_e_mode_out = diag->num_e_mode_out;
-  const CeedScalar   *identity       = diag->d_identity;
-  const CeedScalar   *interp_in      = diag->d_interp_in;
-  const CeedScalar   *grad_in        = diag->d_grad_in;
-  const CeedScalar   *interp_out     = diag->d_interp_out;
-  const CeedScalar   *grad_out       = diag->d_grad_out;
-  const CeedEvalMode *e_mode_in      = diag->d_e_mode_in;
-  const CeedEvalMode *e_mode_out     = diag->d_e_mode_out;
+  const CeedSize      num_nodes         = diag->num_nodes;
+  const CeedSize      num_qpts          = diag->num_qpts;
+  const CeedSize      num_comp          = diag->num_comp;
+  const CeedSize      num_eval_mode_in  = diag->num_eval_mode_in;
+  const CeedSize      num_eval_mode_out = diag->num_eval_mode_out;
+  const CeedScalar   *identity          = diag->d_identity;
+  const CeedScalar   *interp_in         = diag->d_interp_in;
+  const CeedScalar   *grad_in           = diag->d_grad_in;
+  const CeedScalar   *interp_out        = diag->d_interp_out;
+  const CeedScalar   *grad_out          = diag->d_grad_out;
+  const CeedEvalMode *eval_mode_in      = diag->d_eval_mode_in;
+  const CeedEvalMode *eval_mode_out     = diag->d_eval_mode_out;
 
   sycl::range<1> kernel_range(num_elem * num_nodes);
 
@@ -834,18 +858,18 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i
     // Each element
     CeedInt d_out = -1;
     // Each basis eval mode pair
-    for (CeedSize e_out = 0; e_out < num_e_mode_out; e_out++) {
+    for (CeedSize e_out = 0; e_out < num_eval_mode_out; e_out++) {
       const CeedScalar *bt = NULL;
 
-      if (e_mode_out[e_out] == CEED_EVAL_GRAD) ++d_out;
-      CeedOperatorGetBasisPointer_Sycl(&bt, e_mode_out[e_out], identity, interp_out, &grad_out[d_out * num_qpts * num_nodes]);
+      if (eval_mode_out[e_out] == CEED_EVAL_GRAD) ++d_out;
+      CeedOperatorGetBasisPointer_Sycl(&bt, eval_mode_out[e_out], identity, interp_out, &grad_out[d_out * num_qpts * num_nodes]);
       CeedInt d_in = -1;
 
-      for (CeedSize e_in = 0; e_in < num_e_mode_in; e_in++) {
+      for (CeedSize e_in = 0; e_in < num_eval_mode_in; e_in++) {
         const CeedScalar *b = NULL;
 
-        if (e_mode_in[e_in] == CEED_EVAL_GRAD) ++d_in;
-        CeedOperatorGetBasisPointer_Sycl(&b, e_mode_in[e_in], identity, interp_in, &grad_in[d_in * num_qpts * num_nodes]);
+        if (eval_mode_in[e_in] == CEED_EVAL_GRAD) ++d_in;
+        CeedOperatorGetBasisPointer_Sycl(&b, eval_mode_in[e_in], identity, interp_in, &grad_in[d_in * num_qpts * num_nodes]);
         // Each component
         for (CeedSize comp_out = 0; comp_out < num_comp; comp_out++) {
           // Each qpoint/node pair
@@ -856,7 +880,7 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i
 
               for (CeedSize q = 0; q < num_qpts; q++) {
                 const CeedScalar qf_value =
-                    assembled_qf_array[((((e_in * num_comp + comp_in) * num_e_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts +
+                    assembled_qf_array[((((e_in * num_comp + comp_in) * num_eval_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts +
                                        q];
 
                 e_value += bt[q * num_nodes + tid] * qf_value * b[q * num_nodes + tid];
@@ -869,7 +893,8 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i
 
             for (CeedSize q = 0; q < num_qpts; q++) {
               const CeedScalar qf_value =
-                  assembled_qf_array[((((e_in * num_comp + comp_out) * num_e_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts + q];
+                  assembled_qf_array[((((e_in * num_comp + comp_out) * num_eval_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts +
+                                     q];
               e_value += bt[q * num_nodes + tid] * qf_value * b[q * num_nodes + tid];
             }
             elem_diag_array[(comp_out * num_elem + e) * num_nodes + tid] += e_value;
@@ -885,22 +910,26 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i
 // Assemble diagonal common code
 //------------------------------------------------------------------------------
 static inline int CeedOperatorAssembleDiagonalCore_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) {
-  Ceed                ceed;
-  Ceed_Sycl          *sycl_data;
-  CeedInt             num_elem;
-  CeedScalar         *elem_diag_array;
-  const CeedScalar   *assembled_qf_array;
-  CeedVector          assembled_qf = NULL;
-  CeedElemRestriction rstr         = NULL;
-  CeedOperator_Sycl  *impl;
+  Ceed               ceed;
+  Ceed_Sycl         *sycl_data;
+  CeedInt            num_elem;
+  CeedScalar        *elem_diag_array;
+  const CeedScalar  *assembled_qf_array;
+  CeedVector         assembled_qf = NULL;
+  CeedOperator_Sycl *impl;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedGetData(ceed, &sycl_data));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Assemble QFunction
-  CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request));
-  CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
+  {
+    CeedElemRestriction elem_rstr = NULL;
+
+    CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &elem_rstr, request));
+    CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+  }
 
   // Setup
   if (!impl->diag) {
@@ -968,9 +997,9 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl(CeedOperator op,
 //------------------------------------------------------------------------------
 // Single operator assembly setup
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
+static int CeedOperatorAssembleSingleSetup_Sycl(CeedOperator op) {
   Ceed    ceed;
-  CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_e_mode_out = 0,
+  CeedInt num_input_fields, num_output_fields, num_eval_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_eval_mode_out = 0,
                                                num_B_out_mats_to_load = 0, size_B_out = 0, num_qpts = 0, elem_size = 0, num_elem, num_comp,
                                                mat_start = 0;
   CeedEvalMode       *eval_mode_in = NULL, *eval_mode_out = NULL;
@@ -992,63 +1021,81 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL));
   // Note that the kernel will treat each dimension of a gradient action separately;
-  // i.e., when an active input has a CEED_EVAL_GRAD mode, num_ e_mode_in will increment by dim.
-  // However, for the purposes of load_ing the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once,
+  // i.e., when an active input has a CEED_EVAL_GRAD mode, num_ eval_mode_in will increment by dim.
+  // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once,
   // so num_B_in_mats_to_load will be incremented by 1.
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedEvalMode eval_mode;
-    CeedVector   vec;
+    CeedVector vec;
 
     CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in));
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &elem_rstr));
+      if (!rstr_in) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_in));
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+      CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size));
+      CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis));
+      if (!basis_in) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_in));
+      CeedCheck(basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedBasisGetDimension(basis_in, &dim));
       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in));
-      CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       if (eval_mode != CEED_EVAL_NONE) {
         CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in));
         eval_mode_in[num_B_in_mats_to_load] = eval_mode;
         num_B_in_mats_to_load += 1;
         if (eval_mode == CEED_EVAL_GRAD) {
-          num_e_mode_in += dim;
+          num_eval_mode_in += dim;
           size_B_in += dim * elem_size * num_qpts;
         } else {
-          num_e_mode_in += 1;
+          num_eval_mode_in += 1;
           size_B_in += elem_size * num_qpts;
         }
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis; basis_out and rstr_out only used if same as input, TODO
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields));
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedEvalMode eval_mode;
-    CeedVector   vec;
+    CeedVector vec;
 
     CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out));
-      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out));
-      CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly");
+      CeedEvalMode        eval_mode;
+      CeedElemRestriction elem_rstr;
+      CeedBasis           basis;
+
+      CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &elem_rstr));
+      if (!rstr_out) CeedCallBackend(CeedElemRestrictionReferenceCopy(elem_rstr, &rstr_out));
+      CeedCheck(rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly");
+      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
+      CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis));
+      if (!basis_out) CeedCallBackend(CeedBasisReferenceCopy(basis, &basis_out));
+      CeedCheck(basis_out == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases");
+      CeedCallBackend(CeedBasisDestroy(&basis));
       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       if (eval_mode != CEED_EVAL_NONE) {
         CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out));
         eval_mode_out[num_B_out_mats_to_load] = eval_mode;
         num_B_out_mats_to_load += 1;
         if (eval_mode == CEED_EVAL_GRAD) {
-          num_e_mode_out += dim;
+          num_eval_mode_out += dim;
           size_B_out += dim * elem_size * num_qpts;
         } else {
-          num_e_mode_out += 1;
+          num_eval_mode_out += 1;
           size_B_out += elem_size * num_qpts;
         }
       }
     }
+    CeedCallBackend(CeedVectorDestroy(&vec));
   }
-  CeedCheck(num_e_mode_in > 0 && num_e_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
+  CeedCheck(num_eval_mode_in > 0 && num_eval_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
 
   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem));
   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp));
@@ -1061,16 +1108,16 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
   CeedCallBackend(CeedGetData(ceed, &sycl_data));
 
   // Kernel setup
-  int elems_per_block   = 1;
-  asmb->elems_per_block = elems_per_block;
-  asmb->block_size_x    = elem_size;
-  asmb->block_size_y    = elem_size;
-  asmb->num_e_mode_in   = num_e_mode_in;
-  asmb->num_e_mode_out  = num_e_mode_out;
-  asmb->num_qpts        = num_qpts;
-  asmb->num_nodes       = elem_size;
-  asmb->block_size      = elem_size * elem_size * elems_per_block;
-  asmb->num_comp        = num_comp;
+  int elems_per_block     = 1;
+  asmb->elems_per_block   = elems_per_block;
+  asmb->block_size_x      = elem_size;
+  asmb->block_size_y      = elem_size;
+  asmb->num_eval_mode_in  = num_eval_mode_in;
+  asmb->num_eval_mode_out = num_eval_mode_out;
+  asmb->num_qpts          = num_qpts;
+  asmb->num_nodes         = elem_size;
+  asmb->block_size        = elem_size * elem_size * elems_per_block;
+  asmb->num_comp          = num_comp;
 
   // Build 'full' B matrices (not 1D arrays used for tensor-product matrices
   CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in));
@@ -1127,6 +1174,12 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
       mat_start += dim * elem_size * num_qpts;
     }
   }
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCallBackend(CeedElemRestrictionDestroy(&rstr_out));
+  CeedCallBackend(CeedBasisDestroy(&basis_in));
+  CeedCallBackend(CeedBasisDestroy(&basis_out));
+  CeedCallBackend(CeedQFunctionDestroy(&qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1137,25 +1190,25 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp
                                            CeedScalar *values_array) {
   // This kernels assumes B_in and B_out have the same number of quadrature points and basis points.
   // TODO: expand to more general cases
-  CeedOperatorAssemble_Sycl *asmb           = impl->asmb;
-  const CeedInt              num_elem       = asmb->num_elem;
-  const CeedSize             num_nodes      = asmb->num_nodes;
-  const CeedSize             num_comp       = asmb->num_comp;
-  const CeedSize             num_qpts       = asmb->num_qpts;
-  const CeedSize             num_e_mode_in  = asmb->num_e_mode_in;
-  const CeedSize             num_e_mode_out = asmb->num_e_mode_out;
+  CeedOperatorAssemble_Sycl *asmb              = impl->asmb;
+  const CeedInt              num_elem          = asmb->num_elem;
+  const CeedSize             num_nodes         = asmb->num_nodes;
+  const CeedSize             num_comp          = asmb->num_comp;
+  const CeedSize             num_qpts          = asmb->num_qpts;
+  const CeedSize             num_eval_mode_in  = asmb->num_eval_mode_in;
+  const CeedSize             num_eval_mode_out = asmb->num_eval_mode_out;
 
   // Strides for final output ordering, determined by the reference (inference) implementation of the symbolic assembly, slowest --> fastest: element,
   // comp_in, comp_out, node_row, node_col
   const CeedSize comp_out_stride = num_nodes * num_nodes;
   const CeedSize comp_in_stride  = comp_out_stride * num_comp;
   const CeedSize e_stride        = comp_in_stride * num_comp;
-  // Strides for QF array, slowest --> fastest:  e_mode_in, comp_in,  e_mode_out, comp_out, elem, qpt
-  const CeedSize q_e_stride          = num_qpts;
-  const CeedSize q_comp_out_stride   = num_elem * q_e_stride;
-  const CeedSize q_e_mode_out_stride = q_comp_out_stride * num_comp;
-  const CeedSize q_comp_in_stride    = q_e_mode_out_stride * num_e_mode_out;
-  const CeedSize q_e_mode_in_stride  = q_comp_in_stride * num_comp;
+  // Strides for QF array, slowest --> fastest:  eval_mode_in, comp_in,  eval_mode_out, comp_out, elem, qpt
+  const CeedSize q_e_stride             = num_qpts;
+  const CeedSize q_comp_out_stride      = num_elem * q_e_stride;
+  const CeedSize q_eval_mode_out_stride = q_comp_out_stride * num_comp;
+  const CeedSize q_comp_in_stride       = q_eval_mode_out_stride * num_eval_mode_out;
+  const CeedSize q_eval_mode_in_stride  = q_comp_in_stride * num_comp;
 
   CeedScalar *B_in, *B_out;
   B_in                       = asmb->d_B_in;
@@ -1178,24 +1231,24 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp
         CeedScalar result        = 0.0;
         CeedSize   qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e;
 
-        for (CeedSize e_mode_in = 0; e_mode_in < num_e_mode_in; e_mode_in++) {
-          CeedSize b_in_index = e_mode_in * num_qpts * num_nodes;
+        for (CeedSize eval_mode_in = 0; eval_mode_in < num_eval_mode_in; eval_mode_in++) {
+          CeedSize b_in_index = eval_mode_in * num_qpts * num_nodes;
 
-          for (CeedSize e_mode_out = 0; e_mode_out < num_e_mode_out; e_mode_out++) {
-            CeedSize b_out_index = e_mode_out * num_qpts * num_nodes;
-            CeedSize qf_index    = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in;
+          for (CeedSize eval_mode_out = 0; eval_mode_out < num_eval_mode_out; eval_mode_out++) {
+            CeedSize b_out_index = eval_mode_out * num_qpts * num_nodes;
+            CeedSize qf_index    = qf_index_comp + q_eval_mode_out_stride * eval_mode_out + q_eval_mode_in_stride * eval_mode_in;
 
             // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
             for (CeedSize j = 0; j < num_qpts; j++) {
               result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l];
             }
-          }  // end of  e_mode_out
-        }    // end of  e_mode_in
+          }  // end of  eval_mode_out
+        }  // end of  eval_mode_in
         CeedSize val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l;
 
         values_array[val_index] = result;
       }  // end of out component
-    }    // end of in component
+    }  // end of in component
   });
   return CEED_ERROR_SUCCESS;
 }
@@ -1213,20 +1266,20 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons
   const CeedInt              num_nodes      = asmb->num_nodes;
   const CeedInt              num_comp       = asmb->num_comp;
   const CeedInt              num_qpts       = asmb->num_qpts;
-  const CeedInt              num_e_mode_in  = asmb->num_e_mode_in;
-  const CeedInt              num_e_mode_out = asmb->num_e_mode_out;
+  const CeedInt              num_eval_mode_in  = asmb->num_eval_mode_in;
+  const CeedInt              num_eval_mode_out = asmb->num_eval_mode_out;
 
   // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: elememt,
   // comp_in, comp_out, node_row, node_col
   const CeedInt comp_out_stride = num_nodes * num_nodes;
   const CeedInt comp_in_stride  = comp_out_stride * num_comp;
   const CeedInt e_stride        = comp_in_stride * num_comp;
-  // Strides for QF array, slowest --> fastest:  e_mode_in, comp_in,  e_mode_out, comp_out, elem, qpt
+  // Strides for QF array, slowest --> fastest:  eval_mode_in, comp_in,  eval_mode_out, comp_out, elem, qpt
   const CeedInt q_e_stride         = num_qpts;
   const CeedInt q_comp_out_stride  = num_elem * q_e_stride;
-  const CeedInt q_e_mode_out_stride = q_comp_out_stride * num_comp;
-  const CeedInt q_comp_in_stride   = q_e_mode_out_stride * num_e_mode_out;
-  const CeedInt q_e_mode_in_stride  = q_comp_in_stride * num_comp;
+  const CeedInt q_eval_mode_out_stride = q_comp_out_stride * num_comp;
+  const CeedInt q_comp_in_stride   = q_eval_mode_out_stride * num_eval_mode_out;
+  const CeedInt q_eval_mode_in_stride  = q_comp_in_stride * num_comp;
 
   CeedScalar *B_in, *B_out;
   B_in                        = asmb->d_B_in;
@@ -1255,17 +1308,17 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons
           for (CeedInt i = 0; i < num_nodes; i++) {
             CeedScalar result        = 0.0;
             CeedInt    qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e;
-            for (CeedInt  e_mode_in = 0;  e_mode_in < num_e_mode_in;  e_mode_in++) {
-              CeedInt b_in_index =  e_mode_in * num_qpts * num_nodes;
-              for (CeedInt  e_mode_out = 0;  e_mode_out < num_e_mode_out;  e_mode_out++) {
-                CeedInt b_out_index =  e_mode_out * num_qpts * num_nodes;
-                CeedInt qf_index    = qf_index_comp + q_e_mode_out_stride *  e_mode_out + q_e_mode_in_stride *  e_mode_in;
+            for (CeedInt  eval_mode_in = 0;  eval_mode_in < num_eval_mode_in;  eval_mode_in++) {
+              CeedInt b_in_index =  eval_mode_in * num_qpts * num_nodes;
+              for (CeedInt  eval_mode_out = 0;  eval_mode_out < num_eval_mode_out;  eval_mode_out++) {
+                CeedInt b_out_index =  eval_mode_out * num_qpts * num_nodes;
+                CeedInt qf_index    = qf_index_comp + q_eval_mode_out_stride *  eval_mode_out + q_eval_mode_in_stride *  eval_mode_in;
                 // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
                 for (CeedInt j = 0; j < num_qpts; j++) {
                   result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l];
                 }
-              }  // end of  e_mode_out
-            }    // end of  e_mode_in
+              }  // end of  eval_mode_out
+            }    // end of  eval_mode_in
             CeedInt val_index       = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l;
             values_array[val_index] = result;
           }  // end of loop over element node index, i
@@ -1284,7 +1337,7 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons
 // input restriction/basis per operator (could have multiple basis eval modes).
 // TODO: allow multiple active input restrictions/basis objects
 //------------------------------------------------------------------------------
-static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, CeedVector values) {
+static int CeedOperatorAssembleSingle_Sycl(CeedOperator op, CeedInt offset, CeedVector values) {
   Ceed                ceed;
   Ceed_Sycl          *sycl_data;
   CeedScalar         *values_array;
@@ -1294,12 +1347,13 @@ static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, Ceed
   CeedOperator_Sycl  *impl;
 
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
-  CeedCallBackend(CeedOperatorGetData(op, &impl));
   CeedCallBackend(CeedGetData(ceed, &sycl_data));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Setup
   if (!impl->asmb) {
-    CeedCallBackend(CeedSingleOperatorAssembleSetup_Sycl(op));
+    CeedCallBackend(CeedOperatorAssembleSingleSetup_Sycl(op));
     assert(impl->asmb != NULL);
   }
 
@@ -1341,11 +1395,12 @@ int CeedOperatorCreate_Sycl(CeedOperator op) {
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Sycl));
-  CeedCallBackend(
-      CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl));
-  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Sycl));
+  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal",
+                                            CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl));
+  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "LinearAssembleSingle", CeedOperatorAssembleSingle_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
index 23e792f90e..7a68343f29 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
index 759b9b9a5a..82cac87b6d 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -35,8 +35,8 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
   CeedQFunctionField *input_fields, *output_fields;
   CeedQFunction_Sycl *impl;
 
-  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&impl));
   // QFunction is built
+  CeedCallBackend(CeedQFunctionGetData(qf, (void **)&impl));
   if (impl->QFunction) return CEED_ERROR_SUCCESS;
 
   CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
@@ -118,7 +118,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
   for (CeedInt i = 0; i < num_input_fields; ++i) {
     code << "  CeedScalar U_" << i << "[" << input_sizes[i] << "];\n";
   }
-  code << "  const CeedScalar *inputs[" << num_input_fields << "] = {U_0";
+  code << "  const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "] = {U_0";
   for (CeedInt i = 1; i < num_input_fields; i++) {
     code << ", U_" << i << "\n";
   }
@@ -129,7 +129,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
     code << "  CeedScalar V_" << i << "[" << output_sizes[i] << "];\n";
   }
-  code << "  CeedScalar *outputs[" << num_output_fields << "] = {V_0";
+  code << "  CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "] = {V_0";
   for (CeedInt i = 1; i < num_output_fields; i++) {
     code << ", V_" << i << "\n";
   }
@@ -175,6 +175,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
   CeedCallBackend(CeedFree(&qfunction_source));
   CeedCallBackend(CeedFree(&read_write_kernel_path));
   CeedCallBackend(CeedFree(&read_write_kernel_source));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
index 4de8fcf379..fb0ad6f287 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
@@ -37,6 +37,7 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C
 
   CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
   CeedCallBackend(CeedGetData(ceed, &ceed_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
 
   CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));
 
@@ -118,6 +119,7 @@ static int CeedQFunctionDestroy_Sycl(CeedQFunction qf) {
   delete impl->QFunction;
   delete impl->sycl_module;
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -134,6 +136,7 @@ int CeedQFunctionCreate_Sycl(CeedQFunction qf) {
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
index 1c942a645b..1a08c26cb5 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -42,6 +42,7 @@ static inline int CeedQFunctionContextSyncH2D_Sycl(const CeedQFunctionContext ct
   if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
   sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->d_data, impl->h_data, ctx_size, e);
   CeedCallSycl(ceed, copy_event.wait_and_throw());
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -75,6 +76,7 @@ static inline int CeedQFunctionContextSyncD2H_Sycl(const CeedQFunctionContext ct
   if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
   sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->h_data, impl->d_data, ctx_size, e);
   CeedCallSycl(ceed, copy_event.wait_and_throw());
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -88,7 +90,9 @@ static inline int CeedQFunctionContextSync_Sycl(const CeedQFunctionContext ctx,
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSyncH2D_Sycl(ctx);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -229,6 +233,7 @@ static int CeedQFunctionContextSetDataDevice_Sycl(const CeedQFunctionContext ctx
       impl->d_data          = data;
     } break;
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -237,9 +242,6 @@ static int CeedQFunctionContextSetDataDevice_Sycl(const CeedQFunctionContext ctx
 //   freeing any previously allocated data if applicable
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextSetData_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, const CeedCopyMode copy_mode, void *data) {
-  Ceed ceed;
-
-  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCallBackend(CeedQFunctionContextSetAllInvalid_Sycl(ctx));
   switch (mem_type) {
     case CEED_MEM_HOST:
@@ -247,7 +249,9 @@ static int CeedQFunctionContextSetData_Sycl(const CeedQFunctionContext ctx, cons
     case CEED_MEM_DEVICE:
       return CeedQFunctionContextSetDataDevice_Sycl(ctx, copy_mode, data);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -260,8 +264,9 @@ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, con
   CeedQFunctionContext_Sycl *impl;
 
   CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
-  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
   CeedCallBackend(CeedGetData(ceed, &ceedSycl));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
 
   // Order queue if needed
   if (!ceedSycl->sycl_queue.is_in_order()) ceedSycl->sycl_queue.ext_oneapi_submit_barrier();
@@ -291,11 +296,9 @@ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, con
 //   If a different memory type is most up to date, this will perform a copy
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextGetDataCore_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) {
-  Ceed                       ceed;
   bool                       need_sync = false;
   CeedQFunctionContext_Sycl *impl;
 
-  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
 
   // Sync data to requested mem_type
@@ -325,11 +328,9 @@ static int CeedQFunctionContextGetDataRead_Sycl(const CeedQFunctionContext ctx,
 // Get read/write access to the data
 //------------------------------------------------------------------------------
 static int CeedQFunctionContextGetData_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) {
-  Ceed                       ceed;
   CeedQFunctionContext_Sycl *impl;
 
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
-  CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCallBackend(CeedQFunctionContextGetDataCore_Sycl(ctx, mem_type, data));
 
   // Mark only pointer for requested memory as valid
@@ -360,6 +361,7 @@ static int CeedQFunctionContextDestroy_Sycl(const CeedQFunctionContext ctx) {
   // Wait for all work to finish before freeing memory
   CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw());
   CeedCallSycl(ceed, sycl::free(impl->d_data_owned, sycl_data->sycl_context));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedFree(&impl->h_data_owned));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
@@ -380,6 +382,7 @@ int CeedQFunctionContextCreate_Sycl(CeedQFunctionContext ctx) {
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp
index ae765dbafc..723fcc8cfb 100644
--- a/backends/sycl-ref/ceed-sycl-ref.hpp
+++ b/backends/sycl-ref/ceed-sycl-ref.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
@@ -86,16 +86,16 @@ typedef struct {
   CeedBasis           basis_in, basis_out;
   CeedElemRestriction diag_rstr, point_block_diag_rstr;
   CeedVector          elem_diag, point_block_elem_diag;
-  CeedInt             num_e_mode_in, num_e_mode_out, num_nodes;
+  CeedInt             num_eval_mode_in, num_eval_mode_out, num_nodes;
   CeedInt             num_qpts, num_comp;  // Kernel parameters
-  CeedEvalMode       *h_e_mode_in, *h_e_mode_out;
-  CeedEvalMode       *d_e_mode_in, *d_e_mode_out;
+  CeedEvalMode       *h_eval_mode_in, *h_eval_mode_out;
+  CeedEvalMode       *d_eval_mode_in, *d_eval_mode_out;
   CeedScalar         *d_identity, *d_interp_in, *d_interp_out, *d_grad_in, *d_grad_out;
 } CeedOperatorDiag_Sycl;
 
 typedef struct {
   CeedInt     num_elem, block_size_x, block_size_y, elems_per_block;
-  CeedInt     num_e_mode_in, num_e_mode_out, num_qpts, num_nodes, block_size, num_comp;  // Kernel parameters
+  CeedInt     num_eval_mode_in, num_eval_mode_out, num_qpts, num_nodes, block_size, num_comp;  // Kernel parameters
   bool        fallback;
   CeedScalar *d_B_in, *d_B_out;
 } CeedOperatorAssemble_Sycl;
@@ -106,7 +106,6 @@ typedef struct {
   CeedVector                *q_vecs_out;  // Output Q-vectors needed to apply operator
   CeedInt                    num_e_in;
   CeedInt                    num_e_out;
-  CeedInt                    num_inputs, num_outputs;
   CeedInt                    num_active_in, num_active_out;
   CeedVector                *qf_active_in;
   CeedOperatorDiag_Sycl     *diag;
diff --git a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
index 6229003cb4..ffa5a78d7d 100644
--- a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
index d85d036587..d33d135198 100644
--- a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
@@ -195,6 +195,7 @@ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction rstr, CeedTranspose
   // Restore arrays
   CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -202,10 +203,8 @@ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction rstr, CeedTranspose
 // Get offsets
 //------------------------------------------------------------------------------
 static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction rstr, CeedMemType m_type, const CeedInt **offsets) {
-  Ceed                      ceed;
   CeedElemRestriction_Sycl *impl;
 
-  CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
 
   switch (m_type) {
@@ -240,6 +239,7 @@ static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction rstr) {
   CeedCallSycl(ceed, sycl::free(impl->d_t_indices, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_l_vec_indices, data->sycl_context));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -328,6 +328,7 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction rstr, const
   CeedCallBackend(CeedFree(&l_vec_indices));
   CeedCallBackend(CeedFree(&t_offsets));
   CeedCallBackend(CeedFree(&t_indices));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -472,5 +473,6 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApply_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
index 427f51f727..689d84f78e 100644
--- a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -44,8 +44,9 @@ static inline int CeedVectorSyncH2D_Sycl(const CeedVector vec) {
   CeedVector_Sycl *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
-  CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+
   CeedCheck(impl->h_array, ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device");
 
   CeedCallBackend(CeedVectorGetLength(vec, &length));
@@ -63,6 +64,7 @@ static inline int CeedVectorSyncH2D_Sycl(const CeedVector vec) {
 
   if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
   CeedCallSycl(ceed, data->sycl_queue.copy<CeedScalar>(impl->h_array, impl->d_array, length, e).wait_and_throw());
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -76,8 +78,8 @@ static inline int CeedVectorSyncD2H_Sycl(const CeedVector vec) {
   CeedVector_Sycl *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
-  CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
 
   CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host");
 
@@ -96,6 +98,7 @@ static inline int CeedVectorSyncD2H_Sycl(const CeedVector vec) {
 
   if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
   CeedCallSycl(ceed, data->sycl_queue.copy<CeedScalar>(impl->d_array, impl->h_array, length, e).wait_and_throw());
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -115,7 +118,9 @@ static int CeedVectorSyncArray_Sycl(const CeedVector vec, CeedMemType mem_type)
     case CEED_MEM_DEVICE:
       return CeedVectorSyncH2D_Sycl(vec);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -244,6 +249,7 @@ static int CeedVectorSetArrayDevice_Sycl(const CeedVector vec, const CeedCopyMod
       impl->d_array          = impl->d_array_borrowed;
       break;
   }
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -263,7 +269,9 @@ static int CeedVectorSetArray_Sycl(const CeedVector vec, const CeedMemType mem_t
     case CEED_MEM_DEVICE:
       return CeedVectorSetArrayDevice_Sycl(vec, copy_mode, array);
   }
+  // LCOV_EXCL_START
   return CEED_ERROR_UNSUPPORTED;
+  // LCOV_EXCL_STOP
 }
 
 //------------------------------------------------------------------------------
@@ -295,9 +303,10 @@ static int CeedVectorSetValue_Sycl(CeedVector vec, CeedScalar val) {
   CeedVector_Sycl *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Set value for synced device/host array
   if (!impl->d_array && !impl->h_array) {
@@ -333,8 +342,10 @@ static int CeedVectorTakeArray_Sycl(CeedVector vec, CeedMemType mem_type, CeedSc
   CeedVector_Sycl *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
-  CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
+  CeedCallBackend(CeedDestroy(&ceed));
+  CeedCallBackend(CeedVectorGetData(vec, &impl));
 
   // Order queue if needed
   if (!data->sycl_queue.is_in_order()) data->sycl_queue.ext_oneapi_submit_barrier();
@@ -447,9 +458,10 @@ static int CeedVectorNorm_Sycl(CeedVector vec, CeedNormType type, CeedScalar *no
   CeedVector_Sycl  *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Compute norm
   CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
@@ -515,9 +527,10 @@ static int CeedVectorReciprocal_Sycl(CeedVector vec) {
   CeedVector_Sycl *impl;
 
   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(vec, &impl));
   CeedCallBackend(CeedVectorGetLength(vec, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Set value for synced device/host array
   if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Sycl(data->sycl_queue, impl->d_array, length));
@@ -554,9 +567,10 @@ static int CeedVectorScale_Sycl(CeedVector x, CeedScalar alpha) {
   CeedVector_Sycl *x_impl;
 
   CeedCallBackend(CeedVectorGetCeed(x, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(x, &x_impl));
   CeedCallBackend(CeedVectorGetLength(x, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Set value for synced device/host array
   if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Sycl(data->sycl_queue, x_impl->d_array, alpha, length));
@@ -593,10 +607,11 @@ static int CeedVectorAXPY_Sycl(CeedVector y, CeedScalar alpha, CeedVector x) {
   CeedVector_Sycl *y_impl, *x_impl;
 
   CeedCallBackend(CeedVectorGetCeed(y, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(y, &y_impl));
   CeedCallBackend(CeedVectorGetData(x, &x_impl));
   CeedCallBackend(CeedVectorGetLength(y, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Set value for synced device/host array
   if (y_impl->d_array) {
@@ -639,11 +654,12 @@ static int CeedVectorPointwiseMult_Sycl(CeedVector w, CeedVector x, CeedVector y
   CeedVector_Sycl *w_impl, *x_impl, *y_impl;
 
   CeedCallBackend(CeedVectorGetCeed(w, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorGetData(w, &w_impl));
   CeedCallBackend(CeedVectorGetData(x, &x_impl));
   CeedCallBackend(CeedVectorGetData(y, &y_impl));
   CeedCallBackend(CeedVectorGetLength(w, &length));
-  CeedCallBackend(CeedGetData(ceed, &data));
 
   // Set value for synced device/host array
   if (!w_impl->d_array && !w_impl->h_array) {
@@ -681,6 +697,7 @@ static int CeedVectorDestroy_Sycl(const CeedVector vec) {
 
   CeedCallBackend(CeedFree(&impl->h_array_owned));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -711,6 +728,7 @@ int CeedVectorCreate_Sycl(CeedSize n, CeedVector vec) {
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "Scale", CeedVectorScale_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Sycl));
+  CeedCallBackend(CeedDestroy(&ceed));
   CeedCallBackend(CeedVectorSetData(vec, impl));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/sycl-ref/kernels/sycl-ref-vector.cpp b/backends/sycl-ref/kernels/sycl-ref-vector.cpp
index 788b608f3a..11db777dce 100644
--- a/backends/sycl-ref/kernels/sycl-ref-vector.cpp
+++ b/backends/sycl-ref/kernels/sycl-ref-vector.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
diff --git a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
index 27ca11b6e5..162b2acb3c 100644
--- a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
+++ b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -106,6 +106,7 @@ int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, Ce
       //-----------
       std::vector<sycl::event> e;
 
+      CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[eval_mode]);
       if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};
 
       ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
@@ -127,7 +128,7 @@ int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, Ce
   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
-
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -143,7 +144,7 @@ static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) {
   CeedCallBackend(CeedBasisGetData(basis, &impl));
   CeedCallBackend(CeedGetData(ceed, &data));
   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
-  CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
+  if (impl->d_q_weight_1d) CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_collo_grad_1d, data->sycl_context));
@@ -156,6 +157,7 @@ static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) {
   delete impl->sycl_module;
 
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -198,17 +200,23 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
 
   // Copy basis data to GPU
-  CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
-  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
+  std::vector<sycl::event> copy_events;
+  if (q_weight_1d) {
+    CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
+    sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
+    copy_events.push_back(copy_weight);
+  }
 
   const CeedInt interp_length = Q_1d * P_1d;
   CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
   sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, e);
+  copy_events.push_back(copy_interp);
 
   CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
   sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, e);
+  copy_events.push_back(copy_grad);
 
-  CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
+  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));
 
   // Compute collocated gradient and copy to GPU
   impl->d_collo_grad_1d          = NULL;
@@ -270,6 +278,7 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Sycl_shared));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl_shared));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl-shared/ceed-sycl-shared.hpp b/backends/sycl-shared/ceed-sycl-shared.hpp
index e4a4c9f203..2e2c3df1ca 100644
--- a/backends/sycl-shared/ceed-sycl-shared.hpp
+++ b/backends/sycl-shared/ceed-sycl-shared.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
index d629e76f95..a563a73626 100644
--- a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
+++ b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -19,7 +19,7 @@
 //------------------------------------------------------------------------------
 static int CeedInit_Sycl_shared(const char *resource, Ceed ceed) {
   Ceed       ceed_ref;
-  Ceed_Sycl *data, *ref_data;
+  Ceed_Sycl *data;
   char      *resource_root;
 
   CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root));
diff --git a/backends/sycl/ceed-sycl-common.hpp b/backends/sycl/ceed-sycl-common.hpp
index e61cbebc18..f087f8c29a 100644
--- a/backends/sycl/ceed-sycl-common.hpp
+++ b/backends/sycl/ceed-sycl-common.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl/ceed-sycl-common.sycl.cpp b/backends/sycl/ceed-sycl-common.sycl.cpp
index e51405d7fa..aa09b693df 100644
--- a/backends/sycl/ceed-sycl-common.sycl.cpp
+++ b/backends/sycl/ceed-sycl-common.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other
 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
 // files for details.
 //
@@ -8,6 +8,7 @@
 
 #include "ceed-sycl-common.hpp"
 
+#include <sstream>
 #include <string>
 #include <sycl/sycl.hpp>
 
@@ -107,12 +108,14 @@ int CeedSetStream_Sycl(Ceed ceed, void *handle) {
   if (ceed_delegate) {
     CeedCallBackend(CeedSetStream_Sycl(ceed_delegate, handle));
   }
+  CeedCallBackend(CeedDestroy(&ceed_delegate));
 
   // Set queue and context for Ceed Fallback object
-  CeedGetOperatorFallbackCeed(ceed, &ceed_fallback);
+  CeedCallBackend(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
   if (ceed_fallback) {
     CeedCallBackend(CeedSetStream_Sycl(ceed_fallback, handle));
   }
+  CeedCallBackend(CeedDestroy(&ceed_fallback));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl/ceed-sycl-compile.hpp b/backends/sycl/ceed-sycl-compile.hpp
index 67db04f294..1baa1f3ca4 100644
--- a/backends/sycl/ceed-sycl-compile.hpp
+++ b/backends/sycl/ceed-sycl-compile.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp
index 9615114158..f939ca940f 100644
--- a/backends/sycl/ceed-sycl-compile.sycl.cpp
+++ b/backends/sycl/ceed-sycl-compile.sycl.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -61,7 +61,7 @@ static int CeedJitAddDefinitions_Sycl(Ceed ceed, const std::string &kernel_sourc
 // TODO: Add architecture flags, optimization flags
 //------------------------------------------------------------------------------
 static inline int CeedJitGetFlags_Sycl(std::vector<std::string> &flags) {
-  flags = {std::string("-cl-std=CL3.0"), std::string("-Dint32_t=int")};
+  flags = {std::string("-cl-std=CL3.0"), std::string("-Dint32_t=int"), std::string("-DCEED_RUNNING_JIT_PASS=1")};
   return CEED_ERROR_SUCCESS;
 }
 
@@ -106,7 +106,7 @@ static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, con
 
     zeModuleBuildLogGetString(lz_log, &log_size, nullptr);
 
-    CeedCall(CeedCalloc(log_size, &log_message));
+    CeedCallBackend(CeedCalloc(log_size, &log_message));
     zeModuleBuildLogGetString(lz_log, &log_size, log_message);
 
     return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to compile Level Zero module:\n%s", log_message);
@@ -157,8 +157,9 @@ int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::st
     return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to retrieve kernel from Level Zero module");
   }
 
-  *sycl_kernel = new sycl::kernel(sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>(
-      {*sycl_module, lz_kernel, sycl::ext::oneapi::level_zero::ownership::transfer}, data->sycl_context));
+  *sycl_kernel = new sycl::kernel(sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>({*sycl_module, lz_kernel,
+                                                                                           sycl::ext::oneapi::level_zero::ownership::transfer},
+                                                                                          data->sycl_context));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/backends/sycl/online_compiler.hpp b/backends/sycl/online_compiler.hpp
index f9fbf529fa..74d2577bc3 100644
--- a/backends/sycl/online_compiler.hpp
+++ b/backends/sycl/online_compiler.hpp
@@ -63,7 +63,7 @@ class device_arch {
 class online_compile_error : public sycl::exception {
  public:
   online_compile_error() = default;
-  online_compile_error(const std::string &Msg) : sycl::exception(Msg) {}
+  online_compile_error(const std::string &Msg) : sycl::exception(make_error_code(errc::invalid), Msg) {}
 };
 
 /// Designates a source language for the online compiler.
diff --git a/backends/weak/ceed-avx-weak.c b/backends/weak/ceed-avx-weak.c
new file mode 100644
index 0000000000..639c08f63b
--- /dev/null
+++ b/backends/weak/ceed-avx-weak.c
@@ -0,0 +1,12 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+// LCOV_EXCL_START
+#include "../ceed-backend-list-avx.h"
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/ceed-backend-weak.c b/backends/weak/ceed-backend-weak.c
similarity index 74%
rename from backends/ceed-backend-weak.c
rename to backends/weak/ceed-backend-weak.c
index e4c401f6a9..1ae70f81a6 100644
--- a/backends/ceed-backend-weak.c
+++ b/backends/weak/ceed-backend-weak.c
@@ -1,10 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
+#include "ceed-backend-weak.h"
 #include <ceed.h>
 #include <ceed/backend.h>
 #include <stdarg.h>
@@ -17,7 +18,7 @@ static int CeedInit_Weak(const char *resource, Ceed ceed) {
 }
 
 // This function provides a debug target for weak symbols
-static int CeedRegister_Weak(const char *name, int num_prefixes, ...) {
+int CeedRegister_Weak(const char *name, int num_prefixes, ...) {
   va_list prefixes;
   int     ierr;
 
@@ -36,9 +37,3 @@ static int CeedRegister_Weak(const char *name, int num_prefixes, ...) {
   return CEED_ERROR_SUCCESS;
 }
 // LCOV_EXCL_STOP
-
-#define CEED_BACKEND(name, num_prefixes, ...)       \
-  CEED_INTERN int name(void) __attribute__((weak)); \
-  int             name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
-#include "ceed-backend-list.h"
-#undef CEED_BACKEND
diff --git a/backends/weak/ceed-backend-weak.h b/backends/weak/ceed-backend-weak.h
new file mode 100644
index 0000000000..b828c44394
--- /dev/null
+++ b/backends/weak/ceed-backend-weak.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <stdarg.h>
+
+CEED_INTERN int CeedRegister_Weak(const char *name, int num_prefixes, ...);
+
+#define CEED_BACKEND(name, num_prefixes, ...) \
+  CEED_INTERN int __attribute__((weak)) name(void) { return CeedRegister_Weak(__func__, num_prefixes, __VA_ARGS__); }
diff --git a/backends/weak/ceed-cuda-weak.c b/backends/weak/ceed-cuda-weak.c
new file mode 100644
index 0000000000..8bc81c78f5
--- /dev/null
+++ b/backends/weak/ceed-cuda-weak.c
@@ -0,0 +1,12 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+// LCOV_EXCL_START
+#include "../ceed-backend-list-cuda.h"
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/weak/ceed-hip-weak.c b/backends/weak/ceed-hip-weak.c
new file mode 100644
index 0000000000..ec90d3bdee
--- /dev/null
+++ b/backends/weak/ceed-hip-weak.c
@@ -0,0 +1,12 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+// LCOV_EXCL_START
+#include "../ceed-backend-list-hip.h"
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/weak/ceed-magma-weak.c b/backends/weak/ceed-magma-weak.c
new file mode 100644
index 0000000000..cace059504
--- /dev/null
+++ b/backends/weak/ceed-magma-weak.c
@@ -0,0 +1,12 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+// LCOV_EXCL_START
+#include "../ceed-backend-list-magma.h"
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/weak/ceed-memcheck-weak.c b/backends/weak/ceed-memcheck-weak.c
new file mode 100644
index 0000000000..35fd01613b
--- /dev/null
+++ b/backends/weak/ceed-memcheck-weak.c
@@ -0,0 +1,12 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+// LCOV_EXCL_START
+#include "../ceed-backend-list-memcheck.h"
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/weak/ceed-sycl-weak.c b/backends/weak/ceed-sycl-weak.c
new file mode 100644
index 0000000000..92bc508449
--- /dev/null
+++ b/backends/weak/ceed-sycl-weak.c
@@ -0,0 +1,12 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+// LCOV_EXCL_START
+#include "../ceed-backend-list-sycl.h"
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/weak/ceed-xsmm-weak.c b/backends/weak/ceed-xsmm-weak.c
new file mode 100644
index 0000000000..6ae36a2822
--- /dev/null
+++ b/backends/weak/ceed-xsmm-weak.c
@@ -0,0 +1,12 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "ceed-backend-weak.h"
+// LCOV_EXCL_START
+#include "../ceed-backend-list-xsmm.h"
+// LCOV_EXCL_STOP
+#undef CEED_BACKEND
diff --git a/backends/xsmm/ceed-xsmm-blocked.c b/backends/xsmm/ceed-xsmm-blocked.c
index 90dc19e741..2abaa247c1 100644
--- a/backends/xsmm/ceed-xsmm-blocked.c
+++ b/backends/xsmm/ceed-xsmm-blocked.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -25,6 +25,7 @@ static int CeedInit_Xsmm_Blocked(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Xsmm));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/xsmm/ceed-xsmm-serial.c b/backends/xsmm/ceed-xsmm-serial.c
index 68e51a63e3..7892e845be 100644
--- a/backends/xsmm/ceed-xsmm-serial.c
+++ b/backends/xsmm/ceed-xsmm-serial.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -25,6 +25,7 @@ static int CeedInit_Xsmm_Serial(const char *resource, Ceed ceed) {
   // Create reference Ceed that implementation will be dispatched through unless overridden
   CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref));
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+  CeedCallBackend(CeedDestroy(&ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Xsmm));
   return CEED_ERROR_SUCCESS;
diff --git a/backends/xsmm/ceed-xsmm-tensor.c b/backends/xsmm/ceed-xsmm-tensor.c
index 0d7383bf40..21bf22ef8b 100644
--- a/backends/xsmm/ceed-xsmm-tensor.c
+++ b/backends/xsmm/ceed-xsmm-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,10 +16,6 @@
 //------------------------------------------------------------------------------
 static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *restrict t,
                                         CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v) {
-  Ceed ceed;
-
-  CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));
-
   if (C == 1) {
     // Build or query the required kernel
     const int                  flags_t    = LIBXSMM_GEMM_FLAGS(!t_mode ? 'T' : 'N', 'N');
@@ -30,10 +26,10 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A,
                                                                             LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64)
                                                 : libxsmm_create_gemm_shape(J, A, B, !t_mode ? B : J, B, J, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32,
                                                                             LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32);
-    const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE);
+    const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE);
     libxsmm_gemm_param         gemm_param;
 
-    CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build.");
+    CeedCheck(kernel, CeedTensorContractReturnCeed(contract), CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build.");
 
     // Run kernel
     gemm_param.a.primary = (CeedScalar *)&t[0];
@@ -50,10 +46,10 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A,
                                                                             LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64)
                                                 : libxsmm_create_gemm_shape(C, J, B, C, !t_mode ? B : J, C, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32,
                                                                             LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32);
-    const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE);
+    const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE);
     libxsmm_gemm_param         gemm_param;
 
-    CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build.");
+    CeedCheck(kernel, CeedTensorContractReturnCeed(contract), CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build.");
 
     // Run kernel
     gemm_param.b.primary = (CeedScalar *)&t[0];
diff --git a/backends/xsmm/ceed-xsmm.h b/backends/xsmm/ceed-xsmm.h
index 0cb56591fe..124d8d4493 100644
--- a/backends/xsmm/ceed-xsmm.h
+++ b/backends/xsmm/ceed-xsmm.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/benchmark.sh b/benchmarks/benchmark.sh
index 167e374f7b..59ff3cc0d7 100755
--- a/benchmarks/benchmark.sh
+++ b/benchmarks/benchmark.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/petsc-bps.sh b/benchmarks/petsc-bps.sh
index 46ba51b73c..004dc4b5b8 100755
--- a/benchmarks/petsc-bps.sh
+++ b/benchmarks/petsc-bps.sh
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/petsc-bpsraw.sh b/benchmarks/petsc-bpsraw.sh
index 666593c7d3..7099bb4ce1 100755
--- a/benchmarks/petsc-bpsraw.sh
+++ b/benchmarks/petsc-bpsraw.sh
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/postprocess_base.py b/benchmarks/postprocess_base.py
index b9a8d46ddf..f69d283d38 100755
--- a/benchmarks/postprocess_base.py
+++ b/benchmarks/postprocess_base.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/postprocess_plot.py b/benchmarks/postprocess_plot.py
index 62939e54d7..59101837f3 100755
--- a/benchmarks/postprocess_plot.py
+++ b/benchmarks/postprocess_plot.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/benchmarks/postprocess_table.py b/benchmarks/postprocess_table.py
index 27a200e0b1..8822a346ff 100755
--- a/benchmarks/postprocess_table.py
+++ b/benchmarks/postprocess_table.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/ceed.pc.template b/ceed.pc.template
index 56bc5a076f..1d8458a4ee 100644
--- a/ceed.pc.template
+++ b/ceed.pc.template
@@ -1,6 +1,7 @@
 prefix=%prefix%
 includedir=${prefix}/include
 libdir=${prefix}/lib
+cflags_extra=%opt%
 
 Name: CEED
 Description: Code for Efficient Extensible Discretization
diff --git a/common.mk b/common.mk
index 4c466b8782..1a53bbf820 100644
--- a/common.mk
+++ b/common.mk
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/coverage.info b/coverage.info
new file mode 100644
index 0000000000..2e177c3e35
--- /dev/null
+++ b/coverage.info
@@ -0,0 +1,12848 @@
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/blocked/ceed-blocked-operator.c
+FNL:0,19,197
+FNA:0,192,CeedOperatorSetupFields_Blocked
+FNL:1,202,266
+FNA:1,96,CeedOperatorSetup_Blocked
+FNL:2,271,303
+FNA:2,96,CeedOperatorSetupInputs_Blocked
+FNL:3,308,354
+FNA:3,192,CeedOperatorInputBasis_Blocked
+FNL:4,359,400
+FNA:4,192,CeedOperatorOutputBasis_Blocked
+FNL:5,405,427
+FNA:5,96,CeedOperatorRestoreInputs_Blocked
+FNL:6,432,520
+FNA:6,96,CeedOperatorApplyAdd_Blocked
+FNL:7,525,719
+FNA:7,0,CeedOperatorLinearAssembleQFunctionCore_Blocked
+FNL:8,724,726
+FNA:8,0,CeedOperatorLinearAssembleQFunction_Blocked
+FNL:9,731,733
+FNA:9,0,CeedOperatorLinearAssembleQFunctionUpdate_Blocked
+FNL:10,738,775
+FNA:10,96,CeedOperatorDestroy_Blocked
+FNL:11,780,793
+FNA:11,96,CeedOperatorCreate_Blocked
+FNF:12
+FNH:9
+DA:19,192
+DA:31,192
+DA:32,192
+DA:33,192
+DA:34,192
+DA:36,192
+DA:37,96
+DA:38,96
+DA:40,96
+DA:41,96
+DA:45,480
+DA:49,288
+DA:50,288
+DA:57,240
+DA:58,240
+DA:59,240
+DA:60,240
+DA:61,240
+DA:62,240
+DA:63,240
+DA:65,240
+DA:66,240
+DA:67,144
+DA:68,144
+DA:70,144
+DA:71,144
+DA:73,144
+DA:74,144
+DA:75,0
+DA:76,0
+DA:77,0
+DA:79,0
+DA:80,0
+DA:81,0
+DA:83,0
+DA:84,0
+DA:85,0
+DA:86,0
+DA:87,0
+DA:88,0
+DA:90,0
+DA:91,0
+DA:92,0
+DA:95,0
+DA:96,0
+DA:97,0
+DA:98,96
+DA:101,96
+DA:102,96
+DA:104,96
+DA:105,0
+DA:107,0
+DA:109,240
+DA:110,240
+DA:111,240
+DA:114,288
+DA:115,96
+DA:116,96
+DA:117,96
+DA:118,96
+DA:119,96
+DA:120,144
+DA:124,144
+DA:125,144
+DA:126,144
+DA:127,144
+DA:128,144
+DA:129,144
+DA:130,144
+DA:131,144
+DA:132,144
+DA:133,144
+DA:134,48
+DA:135,48
+DA:136,48
+DA:137,48
+DA:138,48
+DA:139,48
+DA:140,48
+DA:144,192
+DA:145,288
+DA:149,192
+DA:150,192
+DA:151,288
+DA:155,96
+DA:156,96
+DA:157,96
+DA:158,0
+DA:159,0
+DA:160,0
+DA:162,96
+DA:163,96
+DA:165,192
+DA:166,192
+DA:169,192
+DA:173,96
+DA:174,96
+DA:175,96
+DA:179,0
+DA:180,0
+DA:181,0
+DA:182,0
+DA:183,0
+DA:184,0
+DA:185,0
+DA:186,0
+DA:188,0
+DA:189,0
+DA:191,96
+DA:192,96
+DA:195,192
+DA:196,192
+DA:202,96
+DA:205,96
+DA:211,96
+DA:212,96
+DA:214,96
+DA:215,96
+DA:216,96
+DA:217,96
+DA:218,96
+DA:219,96
+DA:222,96
+DA:223,96
+DA:225,96
+DA:226,96
+DA:227,96
+DA:228,96
+DA:229,96
+DA:230,96
+DA:231,96
+DA:232,96
+DA:233,96
+DA:235,96
+DA:236,96
+DA:240,96
+DA:243,96
+DA:248,96
+DA:252,0
+DA:253,0
+DA:254,0
+DA:256,0
+DA:257,0
+DA:259,0
+DA:263,96
+DA:264,96
+DA:265,96
+DA:271,96
+DA:274,288
+DA:281,192
+DA:282,192
+DA:283,192
+DA:284,96
+DA:285,96
+DA:288,192
+DA:289,192
+DA:292,144
+DA:293,144
+DA:294,144
+DA:296,144
+DA:298,144
+DA:300,192
+DA:302,96
+DA:308,192
+DA:311,576
+DA:318,384
+DA:322,0
+DA:323,0
+DA:324,0
+DA:325,0
+DA:329,384
+DA:330,384
+DA:331,384
+DA:332,384
+DA:333,384
+DA:335,384
+DA:336,96
+DA:337,96
+DA:338,96
+DA:339,192
+DA:343,192
+DA:344,192
+DA:345,192
+DA:346,192
+DA:347,192
+DA:348,192
+DA:349,96
+DA:350,96
+DA:353,192
+DA:359,192
+DA:362,384
+DA:369,192
+DA:370,192
+DA:371,192
+DA:372,192
+DA:374,192
+DA:375,96
+DA:376,96
+DA:377,96
+DA:381,96
+DA:382,96
+DA:383,96
+DA:385,96
+DA:386,0
+DA:388,96
+DA:390,96
+DA:391,96
+DA:399,192
+DA:405,96
+DA:407,288
+DA:411,192
+DA:415,0
+DA:416,0
+DA:417,0
+DA:418,0
+DA:420,192
+DA:421,192
+DA:423,144
+DA:426,96
+DA:432,96
+DA:434,96
+DA:436,96
+DA:443,96
+DA:445,96
+DA:448,96
+DA:449,0
+DA:450,0
+DA:451,0
+DA:453,96
+DA:454,96
+DA:455,96
+DA:456,96
+DA:457,96
+DA:458,96
+DA:461,96
+DA:464,192
+DA:465,96
+DA:466,0
+DA:468,96
+DA:473,288
+DA:475,384
+DA:476,192
+DA:477,192
+DA:478,96
+DA:479,96
+DA:485,192
+DA:488,192
+DA:489,192
+DA:493,192
+DA:498,192
+DA:502,96
+DA:504,96
+DA:506,96
+DA:507,96
+DA:509,96
+DA:511,96
+DA:513,96
+DA:517,96
+DA:518,96
+DA:519,96
+DA:525,0
+DA:529,0
+DA:531,0
+DA:537,0
+DA:538,0
+DA:539,0
+DA:540,0
+DA:541,0
+DA:543,0
+DA:544,0
+DA:545,0
+DA:546,0
+DA:547,0
+DA:548,0
+DA:549,0
+DA:552,0
+DA:555,0
+DA:558,0
+DA:561,0
+DA:562,0
+DA:567,0
+DA:568,0
+DA:569,0
+DA:570,0
+DA:571,0
+DA:573,0
+DA:575,0
+DA:576,0
+DA:580,0
+DA:581,0
+DA:586,0
+DA:587,0
+DA:588,0
+DA:589,0
+DA:591,0
+DA:593,0
+DA:594,0
+DA:598,0
+DA:599,0
+DA:601,0
+DA:602,0
+DA:604,0
+DA:607,0
+DA:608,0
+DA:610,0
+DA:612,0
+DA:616,0
+DA:617,0
+DA:618,0
+DA:621,0
+DA:624,0
+DA:628,0
+DA:630,0
+DA:633,0
+DA:639,0
+DA:640,0
+DA:641,0
+DA:642,0
+DA:643,0
+DA:644,0
+DA:649,0
+DA:650,0
+DA:651,0
+DA:654,0
+DA:656,0
+DA:661,0
+DA:663,0
+DA:664,0
+DA:665,0
+DA:666,0
+DA:668,0
+DA:671,0
+DA:677,0
+DA:678,0
+DA:679,0
+DA:680,0
+DA:681,0
+DA:687,0
+DA:688,0
+DA:689,0
+DA:696,0
+DA:697,0
+DA:701,0
+DA:702,0
+DA:703,0
+DA:705,0
+DA:710,0
+DA:713,0
+DA:714,0
+DA:715,0
+DA:716,0
+DA:717,0
+DA:718,0
+DA:724,0
+DA:725,0
+DA:731,0
+DA:732,0
+DA:738,96
+DA:741,96
+DA:743,96
+DA:744,96
+DA:745,96
+DA:746,96
+DA:747,384
+DA:748,288
+DA:749,288
+DA:751,96
+DA:752,96
+DA:753,96
+DA:755,288
+DA:756,192
+DA:757,192
+DA:759,96
+DA:760,96
+DA:762,192
+DA:763,96
+DA:764,96
+DA:766,96
+DA:767,96
+DA:770,96
+DA:771,96
+DA:773,96
+DA:774,96
+DA:780,96
+DA:784,96
+DA:785,96
+DA:786,96
+DA:787,96
+DA:788,96
+DA:789,96
+DA:790,96
+DA:791,96
+DA:792,96
+LF:393
+LH:244
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/blocked/ceed-blocked.c
+FNL:0,18,32
+FNA:0,48,CeedInit_Blocked
+FNL:1,37,37
+FNA:1,192,CeedRegister_Ref_Blocked
+FNF:2
+FNH:2
+DA:18,48
+DA:21,48
+DA:23,48
+DA:26,48
+DA:27,48
+DA:28,48
+DA:30,48
+DA:31,48
+DA:37,192
+LF:9
+LH:9
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-avx.h
+FNL:0,12,12
+FNA:0,192,CeedRegister_Avx_Blocked
+FNL:1,13,13
+FNA:1,192,CeedRegister_Avx_Serial
+FNF:2
+FNH:2
+DA:12,384
+DA:13,384
+LF:2
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-cuda.h
+FNL:0,12,12
+FNA:0,192,CeedRegister_Cuda
+FNL:1,13,13
+FNA:1,192,CeedRegister_Cuda_Gen
+FNL:2,14,14
+FNA:2,192,CeedRegister_Cuda_Shared
+FNF:3
+FNH:3
+DA:12,384
+DA:13,384
+DA:14,384
+LF:3
+LH:3
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-hip.h
+FNL:0,12,12
+FNA:0,192,CeedRegister_Hip
+FNL:1,13,13
+FNA:1,192,CeedRegister_Hip_Gen
+FNL:2,14,14
+FNA:2,192,CeedRegister_Hip_Shared
+FNF:3
+FNH:3
+DA:12,384
+DA:13,384
+DA:14,384
+LF:3
+LH:3
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-magma.h
+FNL:0,12,12
+FNA:0,192,CeedRegister_Magma
+FNL:1,13,13
+FNA:1,192,CeedRegister_Magma_Det
+FNF:2
+FNH:2
+DA:12,384
+DA:13,384
+LF:2
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-memcheck.h
+FNL:0,12,12
+FNA:0,0,CeedRegister_Memcheck_Blocked
+FNL:1,13,13
+FNA:1,0,CeedRegister_Memcheck_Serial
+FNF:2
+FNH:0
+DA:12,192
+DA:13,192
+LF:2
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-ref.h
+FNF:0
+FNH:0
+DA:12,192
+DA:13,192
+DA:14,192
+DA:15,192
+LF:4
+LH:4
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-sycl.h
+FNL:0,12,12
+FNA:0,192,CeedRegister_Sycl
+FNL:1,13,13
+FNA:1,192,CeedRegister_Sycl_Shared
+FNL:2,14,14
+FNA:2,192,CeedRegister_Sycl_Gen
+FNF:3
+FNH:3
+DA:12,384
+DA:13,384
+DA:14,384
+LF:3
+LH:3
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ceed-backend-list-xsmm.h
+FNL:0,12,12
+FNA:0,0,CeedRegister_Xsmm_Blocked
+FNL:1,13,13
+FNA:1,0,CeedRegister_Xsmm_Serial
+FNF:2
+FNH:0
+DA:12,192
+DA:13,192
+LF:2
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-blocked.c
+FNL:0,17,34
+FNA:0,24,CeedInit_Memcheck
+FNL:1,39,39
+FNA:1,192,CeedRegister_Memcheck_Blocked
+FNF:2
+FNH:2
+DA:17,24
+DA:20,24
+DA:23,24
+DA:24,24
+DA:25,24
+DA:27,24
+DA:28,24
+DA:29,24
+DA:30,24
+DA:31,24
+DA:32,24
+DA:33,24
+DA:39,192
+LF:13
+LH:13
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-qfunction.c
+FNL:0,111,124
+FNA:0,96,CeedQFunctionCreate_Memcheck
+FNL:1,19,93
+FNA:1,768,CeedQFunctionApply_Memcheck
+FNL:2,98,106
+FNA:2,96,CeedQFunctionDestroy_Memcheck
+FNF:3
+FNH:3
+DA:19,768
+DA:20,768
+DA:23,768
+DA:27,768
+DA:28,768
+DA:29,768
+DA:30,768
+DA:33,2304
+DA:35,1536
+DA:37,1536
+DA:39,1536
+DA:41,1536
+DA:42,1536
+DA:46,1536
+DA:48,768
+DA:50,768
+DA:52,768
+DA:53,768
+DA:55,768
+DA:56,768
+DA:60,768
+DA:63,2304
+DA:64,1536
+DA:65,1536
+DA:72,768
+DA:73,768
+DA:74,768
+DA:75,1536
+DA:80,768
+DA:81,768
+DA:82,50688
+DA:83,49920
+DA:87,768
+DA:88,768
+DA:91,768
+DA:92,768
+DA:98,96
+DA:101,96
+DA:102,96
+DA:103,96
+DA:104,96
+DA:105,96
+DA:111,96
+DA:115,96
+DA:116,96
+DA:117,96
+DA:118,96
+DA:119,96
+DA:120,96
+DA:121,96
+DA:122,96
+DA:123,96
+LF:52
+LH:52
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-qfunctioncontext.c
+FNL:0,116,140
+FNA:0,0,CeedQFunctionContextTakeData_Memcheck
+FNL:1,145,160
+FNA:1,396,CeedQFunctionContextGetData_Memcheck
+FNL:2,165,182
+FNA:2,0,CeedQFunctionContextGetDataRead_Memcheck
+FNL:3,187,203
+FNA:3,396,CeedQFunctionContextRestoreData_Memcheck
+FNL:4,19,25
+FNA:4,396,CeedQFunctionContextHasValidData_Memcheck
+FNL:5,208,225
+FNA:5,0,CeedQFunctionContextRestoreDataRead_Memcheck
+FNL:6,230,262
+FNA:6,144,CeedQFunctionContextDataDestroy_Memcheck
+FNL:7,267,285
+FNA:7,72,CeedQFunctionContextDestroy_Memcheck
+FNL:8,290,309
+FNA:8,72,CeedQFunctionContextCreate_Memcheck
+FNL:9,30,38
+FNA:9,0,CeedQFunctionContextHasBorrowedDataOfType_Memcheck
+FNL:10,43,89
+FNA:10,72,CeedQFunctionContextSetData_Memcheck
+FNL:11,94,111
+FNA:11,396,CeedQFunctionContextSyncData_Memcheck
+FNF:12
+FNH:8
+DA:19,396
+DA:22,396
+DA:23,396
+DA:24,396
+DA:30,0
+DA:33,0
+DA:35,0
+DA:36,0
+DA:37,0
+DA:43,72
+DA:47,72
+DA:49,72
+DA:50,72
+DA:53,72
+DA:54,0
+DA:55,0
+DA:57,72
+DA:58,72
+DA:59,0
+DA:60,0
+DA:62,72
+DA:65,72
+DA:68,72
+DA:69,0
+DA:70,0
+DA:71,0
+DA:72,0
+DA:73,24
+DA:74,24
+DA:75,24
+DA:76,24
+DA:77,24
+DA:78,48
+DA:79,48
+DA:80,48
+DA:81,48
+DA:85,72
+DA:86,72
+DA:87,72
+DA:88,72
+DA:94,396
+DA:98,396
+DA:100,396
+DA:101,396
+DA:104,396
+DA:105,204
+DA:107,396
+DA:108,192
+DA:110,396
+DA:116,0
+DA:120,0
+DA:122,0
+DA:123,0
+DA:126,0
+DA:129,0
+DA:130,0
+DA:131,0
+DA:134,0
+DA:135,0
+DA:136,0
+DA:138,0
+DA:139,0
+DA:145,396
+DA:149,396
+DA:151,396
+DA:152,396
+DA:155,396
+DA:156,396
+DA:157,396
+DA:158,396
+DA:159,396
+DA:165,0
+DA:169,0
+DA:171,0
+DA:172,0
+DA:175,0
+DA:176,0
+DA:177,0
+DA:178,0
+DA:180,0
+DA:181,0
+DA:187,396
+DA:191,396
+DA:192,396
+DA:195,396
+DA:196,396
+DA:199,396
+DA:200,396
+DA:201,396
+DA:202,396
+DA:208,0
+DA:212,0
+DA:213,0
+DA:216,0
+DA:218,0
+DA:221,0
+DA:222,0
+DA:223,0
+DA:224,0
+DA:230,144
+DA:235,144
+DA:237,144
+DA:238,144
+DA:242,144
+DA:243,0
+DA:245,0
+DA:246,0
+DA:247,0
+DA:250,144
+DA:251,72
+DA:252,72
+DA:254,144
+DA:255,24
+DA:256,24
+DA:258,144
+DA:259,48
+DA:261,144
+DA:267,72
+DA:271,72
+DA:272,72
+DA:273,0
+DA:274,0
+DA:276,72
+DA:277,0
+DA:278,0
+DA:280,72
+DA:281,48
+DA:283,72
+DA:284,72
+DA:290,72
+DA:294,72
+DA:295,72
+DA:296,72
+DA:297,72
+DA:298,72
+DA:299,72
+DA:300,72
+DA:301,72
+DA:302,72
+DA:303,72
+DA:304,72
+DA:305,72
+DA:306,72
+DA:307,72
+DA:308,72
+LF:145
+LH:92
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-restriction.c
+FNL:0,109,149
+FNA:0,0,CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core
+FNL:1,151,190
+FNA:1,0,CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memcheck_Core
+FNL:2,19,41
+FNA:2,192,CeedElemRestrictionGetBackendStrides_Memcheck
+FNL:3,192,216
+FNA:3,48,CeedElemRestrictionApplyStridedTranspose_Memcheck_Core
+FNL:4,218,240
+FNA:4,48,CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core
+FNL:5,242,264
+FNA:5,0,CeedElemRestrictionApplyOrientedTranspose_Memcheck_Core
+FNL:6,266,315
+FNA:6,0,CeedElemRestrictionApplyCurlOrientedTranspose_Memcheck_Core
+FNL:7,317,365
+FNA:7,0,CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Memcheck_Core
+FNL:8,367,390
+FNA:8,0,CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core
+FNL:9,392,500
+FNA:9,240,CeedElemRestrictionApply_Memcheck_Core
+FNL:10,46,70
+FNA:10,48,CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core
+FNL:11,505,516
+FNA:11,240,CeedElemRestrictionApply_Memcheck
+FNL:12,521,533
+FNA:12,0,CeedElemRestrictionApplyUnsigned_Memcheck
+FNL:13,538,550
+FNA:13,0,CeedElemRestrictionApplyUnoriented_Memcheck
+FNL:14,555,563
+FNA:14,0,CeedElemRestrictionApplyAtPointsInElement_Memcheck
+FNL:15,568,579
+FNA:15,0,CeedElemRestrictionApplyBlock_Memcheck
+FNL:16,584,593
+FNA:16,72,CeedElemRestrictionGetOffsets_Memcheck
+FNL:17,598,607
+FNA:17,0,CeedElemRestrictionGetOrientations_Memcheck
+FNL:18,612,621
+FNA:18,0,CeedElemRestrictionGetCurlOrientations_Memcheck
+FNL:19,626,635
+FNA:19,192,CeedElemRestrictionDestroy_Memcheck
+FNL:20,640,773
+FNA:20,264,CeedElemRestrictionCreate_Memcheck
+FNL:21,72,88
+FNA:21,96,CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core
+FNL:22,90,107
+FNA:22,0,CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core
+FNF:23
+FNH:10
+DA:19,192
+DA:22,192
+DA:23,192
+DA:24,192
+DA:26,192
+DA:27,192
+DA:28,192
+DA:40,192
+DA:46,48
+DA:52,48
+DA:54,48
+DA:55,48
+DA:56,0
+DA:59,432
+DA:60,768
+DA:61,9888
+DA:62,34464
+DA:63,24960
+DA:64,24960
+DA:69,48
+DA:72,96
+DA:79,96
+DA:80,864
+DA:81,1656
+DA:82,55608
+DA:83,54720
+DA:87,96
+DA:90,0
+DA:97,0
+DA:98,0
+DA:99,0
+DA:100,0
+DA:101,0
+DA:102,0
+DA:106,0
+DA:109,0
+DA:116,0
+DA:117,0
+DA:118,0
+DA:119,0
+DA:121,0
+DA:122,0
+DA:123,0
+DA:124,0
+DA:125,0
+DA:126,0
+DA:128,0
+DA:129,0
+DA:130,0
+DA:131,0
+DA:132,0
+DA:133,0
+DA:134,0
+DA:135,0
+DA:136,0
+DA:139,0
+DA:140,0
+DA:141,0
+DA:142,0
+DA:143,0
+DA:144,0
+DA:148,0
+DA:151,0
+DA:157,0
+DA:158,0
+DA:159,0
+DA:160,0
+DA:162,0
+DA:163,0
+DA:164,0
+DA:165,0
+DA:166,0
+DA:167,0
+DA:169,0
+DA:170,0
+DA:171,0
+DA:172,0
+DA:173,0
+DA:174,0
+DA:175,0
+DA:176,0
+DA:177,0
+DA:180,0
+DA:181,0
+DA:182,0
+DA:183,0
+DA:184,0
+DA:185,0
+DA:189,0
+DA:192,48
+DA:198,48
+DA:200,48
+DA:201,48
+DA:202,0
+DA:205,432
+DA:206,768
+DA:207,9888
+DA:208,24096
+DA:209,14592
+DA:210,14592
+DA:215,48
+DA:218,48
+DA:225,48
+DA:226,432
+DA:227,768
+DA:228,6624
+DA:230,16000
+DA:233,9760
+DA:234,9760
+DA:239,48
+DA:242,0
+DA:249,0
+DA:250,0
+DA:251,0
+DA:252,0
+DA:254,0
+DA:257,0
+DA:258,0
+DA:263,0
+DA:266,0
+DA:269,0
+DA:272,0
+DA:274,0
+DA:275,0
+DA:276,0
+DA:278,0
+DA:279,0
+DA:281,0
+DA:282,0
+DA:283,0
+DA:284,0
+DA:285,0
+DA:287,0
+DA:288,0
+DA:290,0
+DA:291,0
+DA:292,0
+DA:293,0
+DA:294,0
+DA:295,0
+DA:296,0
+DA:297,0
+DA:299,0
+DA:300,0
+DA:303,0
+DA:304,0
+DA:305,0
+DA:306,0
+DA:307,0
+DA:309,0
+DA:310,0
+DA:314,0
+DA:317,0
+DA:319,0
+DA:322,0
+DA:324,0
+DA:325,0
+DA:326,0
+DA:328,0
+DA:329,0
+DA:331,0
+DA:332,0
+DA:333,0
+DA:334,0
+DA:335,0
+DA:337,0
+DA:338,0
+DA:340,0
+DA:341,0
+DA:342,0
+DA:343,0
+DA:344,0
+DA:345,0
+DA:346,0
+DA:347,0
+DA:349,0
+DA:350,0
+DA:353,0
+DA:354,0
+DA:355,0
+DA:356,0
+DA:357,0
+DA:359,0
+DA:360,0
+DA:364,0
+DA:367,0
+DA:371,0
+DA:374,0
+DA:375,0
+DA:376,0
+DA:377,0
+DA:378,0
+DA:379,0
+DA:380,0
+DA:383,0
+DA:384,0
+DA:387,0
+DA:389,0
+DA:392,240
+DA:401,240
+DA:402,240
+DA:403,240
+DA:404,240
+DA:405,240
+DA:407,240
+DA:409,96
+DA:412,144
+DA:415,240
+DA:421,96
+DA:422,48
+DA:423,48
+DA:425,48
+DA:426,48
+DA:427,48
+DA:429,48
+DA:430,0
+DA:431,0
+DA:432,0
+DA:435,0
+DA:438,0
+DA:439,0
+DA:440,0
+DA:441,0
+DA:443,0
+DA:444,0
+DA:447,0
+DA:450,0
+DA:451,0
+DA:452,0
+DA:453,0
+DA:461,144
+DA:462,48
+DA:463,48
+DA:465,48
+DA:466,96
+DA:467,96
+DA:469,96
+DA:470,0
+DA:471,0
+DA:472,0
+DA:475,0
+DA:478,0
+DA:479,0
+DA:480,0
+DA:481,0
+DA:483,0
+DA:484,0
+DA:487,0
+DA:490,0
+DA:491,0
+DA:492,0
+DA:493,0
+DA:496,240
+DA:497,240
+DA:498,240
+DA:499,240
+DA:505,240
+DA:509,240
+DA:510,240
+DA:511,240
+DA:512,240
+DA:513,240
+DA:514,240
+DA:515,240
+DA:521,0
+DA:526,0
+DA:527,0
+DA:528,0
+DA:529,0
+DA:530,0
+DA:531,0
+DA:532,0
+DA:538,0
+DA:543,0
+DA:544,0
+DA:545,0
+DA:546,0
+DA:547,0
+DA:548,0
+DA:549,0
+DA:555,0
+DA:560,0
+DA:561,0
+DA:562,0
+DA:568,0
+DA:573,0
+DA:574,0
+DA:575,0
+DA:576,0
+DA:577,0
+DA:578,0
+DA:584,72
+DA:587,72
+DA:589,72
+DA:591,72
+DA:592,72
+DA:598,0
+DA:601,0
+DA:603,0
+DA:605,0
+DA:606,0
+DA:612,0
+DA:615,0
+DA:617,0
+DA:619,0
+DA:620,0
+DA:626,192
+DA:629,192
+DA:630,192
+DA:631,192
+DA:632,192
+DA:633,192
+DA:634,192
+DA:640,264
+DA:643,264
+DA:647,264
+DA:648,264
+DA:649,264
+DA:650,264
+DA:651,264
+DA:652,264
+DA:653,264
+DA:654,264
+DA:656,264
+DA:658,264
+DA:659,264
+DA:664,264
+DA:666,264
+DA:667,264
+DA:668,96
+DA:669,96
+DA:670,96
+DA:671,96
+DA:677,264
+DA:678,0
+DA:680,0
+DA:681,0
+DA:683,0
+DA:684,0
+DA:687,0
+DA:688,0
+DA:692,264
+DA:697,168
+DA:698,34328
+DA:699,34160
+DA:705,168
+DA:706,168
+DA:707,168
+DA:708,96
+DA:709,96
+DA:710,96
+DA:711,96
+DA:712,96
+DA:713,72
+DA:714,72
+DA:715,72
+DA:716,72
+DA:717,0
+DA:718,0
+DA:722,168
+DA:723,0
+DA:724,0
+DA:725,0
+DA:726,0
+DA:727,0
+DA:728,0
+DA:729,0
+DA:730,0
+DA:731,0
+DA:732,0
+DA:733,0
+DA:734,0
+DA:735,0
+DA:737,168
+DA:738,0
+DA:739,0
+DA:740,0
+DA:741,0
+DA:742,0
+DA:743,0
+DA:744,0
+DA:745,0
+DA:746,0
+DA:747,0
+DA:748,0
+DA:749,0
+DA:750,0
+DA:756,264
+DA:759,264
+DA:760,264
+DA:761,264
+DA:762,264
+DA:763,0
+DA:766,264
+DA:767,264
+DA:768,264
+DA:769,264
+DA:770,264
+DA:771,264
+DA:772,264
+LF:400
+LH:145
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-serial.c
+FNL:0,17,35
+FNA:0,24,CeedInit_Memcheck
+FNL:1,40,40
+FNA:1,192,CeedRegister_Memcheck_Serial
+FNF:2
+FNH:2
+DA:17,24
+DA:20,24
+DA:24,24
+DA:25,24
+DA:26,24
+DA:28,24
+DA:29,24
+DA:30,24
+DA:31,24
+DA:32,24
+DA:33,24
+DA:34,24
+DA:40,192
+LF:13
+LH:13
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/memcheck/ceed-memcheck-vector.c
+FNL:0,102,113
+FNA:0,144,CeedVectorSetValue_Memcheck
+FNL:1,118,130
+FNA:1,0,CeedVectorSetValueStrided_Memcheck
+FNL:2,135,152
+FNA:2,2352,CeedVectorSyncArray_Memcheck
+FNL:3,157,181
+FNA:3,0,CeedVectorTakeArray_Memcheck
+FNL:4,186,201
+FNA:4,2352,CeedVectorGetArray_Memcheck
+FNL:5,206,223
+FNA:5,3120,CeedVectorGetArrayRead_Memcheck
+FNL:6,21,27
+FNA:6,3240,CeedVectorHasValidArray_Memcheck
+FNL:7,228,247
+FNA:7,2232,CeedVectorGetArrayWrite_Memcheck
+FNL:8,252,279
+FNA:8,2352,CeedVectorRestoreArray_Memcheck
+FNL:9,284,301
+FNA:9,3120,CeedVectorRestoreArrayRead_Memcheck
+FNL:10,306,317
+FNA:10,0,CeedVectorReciprocal_Memcheck
+FNL:11,32,40
+FNA:11,0,CeedVectorHasBorrowedArrayOfType_Memcheck
+FNL:12,322,331
+FNA:12,0,CeedVectorScale_Memcheck
+FNL:13,336,346
+FNA:13,0,CeedVectorAXPY_Memcheck
+FNL:14,351,361
+FNA:14,0,CeedVectorAXPBY_Memcheck
+FNL:15,366,379
+FNA:15,0,CeedVectorPointwiseMult_Memcheck
+FNL:16,384,402
+FNA:16,864,CeedVectorDestroy_Memcheck
+FNL:17,407,434
+FNA:17,864,CeedVectorCreate_Memcheck
+FNL:18,45,97
+FNA:18,2544,CeedVectorSetArray_Memcheck
+FNF:19
+FNH:11
+DA:21,3240
+DA:24,3240
+DA:25,3240
+DA:26,3240
+DA:32,0
+DA:35,0
+DA:37,0
+DA:38,0
+DA:39,0
+DA:45,2544
+DA:49,2544
+DA:51,2544
+DA:52,2544
+DA:55,2544
+DA:56,31648
+DA:57,1680
+DA:59,2544
+DA:60,2544
+DA:61,1968
+DA:62,0
+DA:63,0
+DA:65,1968
+DA:69,2544
+DA:72,2544
+DA:73,576
+DA:75,576
+DA:76,0
+DA:77,0
+DA:78,0
+DA:79,0
+DA:80,0
+DA:81,1968
+DA:82,1968
+DA:83,1968
+DA:84,1968
+DA:85,1968
+DA:89,2544
+DA:90,2544
+DA:91,2544
+DA:92,1992
+DA:94,359032
+DA:96,2544
+DA:102,144
+DA:106,144
+DA:107,144
+DA:109,144
+DA:110,144
+DA:111,30960
+DA:112,144
+DA:118,0
+DA:122,0
+DA:123,0
+DA:125,0
+DA:126,0
+DA:127,0
+DA:128,0
+DA:129,0
+DA:135,2352
+DA:139,2352
+DA:141,2352
+DA:142,2352
+DA:145,2352
+DA:146,0
+DA:148,2352
+DA:149,792
+DA:151,2352
+DA:157,0
+DA:161,0
+DA:163,0
+DA:164,0
+DA:167,0
+DA:170,0
+DA:171,0
+DA:172,0
+DA:175,0
+DA:176,0
+DA:177,0
+DA:179,0
+DA:180,0
+DA:186,2352
+DA:190,2352
+DA:192,2352
+DA:193,2352
+DA:196,2352
+DA:197,2352
+DA:198,2352
+DA:199,2352
+DA:200,2352
+DA:206,3120
+DA:210,3120
+DA:212,3120
+DA:213,3120
+DA:216,3120
+DA:217,3120
+DA:218,3120
+DA:219,3120
+DA:221,3120
+DA:222,3120
+DA:228,2232
+DA:232,2232
+DA:234,2232
+DA:235,2232
+DA:238,2232
+DA:241,2232
+DA:244,417752
+DA:245,2232
+DA:246,2232
+DA:252,2352
+DA:256,2352
+DA:257,2352
+DA:260,2352
+DA:261,417752
+DA:262,415520
+DA:263,0
+DA:267,2232
+DA:271,2352
+DA:272,2352
+DA:275,449456
+DA:276,2352
+DA:277,2352
+DA:278,2352
+DA:284,3120
+DA:288,3120
+DA:289,3120
+DA:292,3120
+DA:294,3120
+DA:297,503680
+DA:298,3120
+DA:299,3120
+DA:300,3120
+DA:306,0
+DA:310,0
+DA:311,0
+DA:313,0
+DA:314,0
+DA:316,0
+DA:322,0
+DA:326,0
+DA:327,0
+DA:329,0
+DA:330,0
+DA:336,0
+DA:340,0
+DA:341,0
+DA:342,0
+DA:344,0
+DA:345,0
+DA:351,0
+DA:355,0
+DA:356,0
+DA:357,0
+DA:359,0
+DA:360,0
+DA:366,0
+DA:370,0
+DA:371,0
+DA:372,0
+DA:373,0
+DA:375,0
+DA:376,0
+DA:377,0
+DA:378,0
+DA:384,864
+DA:388,864
+DA:389,864
+DA:390,864
+DA:391,864
+DA:393,864
+DA:394,0
+DA:395,0
+DA:397,864
+DA:398,288
+DA:400,864
+DA:401,864
+DA:407,864
+DA:411,864
+DA:412,864
+DA:413,864
+DA:414,864
+DA:415,864
+DA:416,864
+DA:417,864
+DA:418,864
+DA:419,864
+DA:420,864
+DA:421,864
+DA:422,864
+DA:423,864
+DA:424,864
+DA:425,864
+DA:426,864
+DA:427,864
+DA:428,864
+DA:429,864
+DA:430,864
+DA:431,864
+DA:432,864
+DA:433,864
+LF:198
+LH:129
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-blocked.c
+FNL:0,18,24
+FNA:0,48,CeedDestroy_Opt
+FNL:1,29,51
+FNA:1,48,CeedInit_Opt_Blocked
+FNL:2,56,56
+FNA:2,192,CeedRegister_Opt_Blocked
+FNF:3
+FNH:3
+DA:18,48
+DA:21,48
+DA:22,48
+DA:23,48
+DA:29,48
+DA:33,48
+DA:35,48
+DA:38,48
+DA:39,48
+DA:40,48
+DA:42,48
+DA:43,48
+DA:44,48
+DA:47,48
+DA:48,48
+DA:49,48
+DA:50,48
+DA:56,192
+LF:18
+LH:18
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-operator.c
+FNL:0,19,200
+FNA:0,384,CeedOperatorSetupFields_Opt
+FNL:1,205,272
+FNA:1,192,CeedOperatorSetup_Opt
+FNL:2,277,312
+FNA:2,192,CeedOperatorSetupInputs_Opt
+FNL:3,317,368
+FNA:3,1536,CeedOperatorInputBasis_Opt
+FNL:4,373,418
+FNA:4,1536,CeedOperatorOutputBasis_Opt
+FNL:5,423,437
+FNA:5,192,CeedOperatorRestoreInputs_Opt
+FNL:6,442,513
+FNA:6,192,CeedOperatorApplyAdd_Opt
+FNL:7,518,726
+FNA:7,0,CeedOperatorLinearAssembleQFunctionCore_Opt
+FNL:8,731,733
+FNA:8,0,CeedOperatorLinearAssembleQFunction_Opt
+FNL:9,738,740
+FNA:9,0,CeedOperatorLinearAssembleQFunctionUpdate_Opt
+FNL:10,745,780
+FNA:10,192,CeedOperatorDestroy_Opt
+FNL:11,785,805
+FNA:11,192,CeedOperatorCreate_Opt
+FNF:12
+FNH:9
+DA:19,384
+DA:31,384
+DA:32,384
+DA:33,384
+DA:34,384
+DA:36,384
+DA:37,192
+DA:38,192
+DA:40,192
+DA:41,192
+DA:45,960
+DA:49,576
+DA:50,576
+DA:57,480
+DA:58,480
+DA:59,480
+DA:60,480
+DA:61,480
+DA:62,480
+DA:63,480
+DA:65,480
+DA:66,480
+DA:67,288
+DA:68,288
+DA:70,288
+DA:71,288
+DA:73,288
+DA:74,288
+DA:75,0
+DA:76,0
+DA:77,0
+DA:79,0
+DA:80,0
+DA:81,0
+DA:83,0
+DA:84,0
+DA:85,0
+DA:86,0
+DA:87,0
+DA:88,0
+DA:90,0
+DA:91,0
+DA:92,0
+DA:95,0
+DA:96,0
+DA:97,0
+DA:98,192
+DA:101,192
+DA:102,192
+DA:104,192
+DA:105,0
+DA:107,0
+DA:109,480
+DA:110,480
+DA:111,480
+DA:114,576
+DA:115,192
+DA:116,192
+DA:117,192
+DA:118,192
+DA:119,192
+DA:120,192
+DA:121,192
+DA:122,288
+DA:126,288
+DA:127,288
+DA:128,288
+DA:129,288
+DA:130,288
+DA:131,288
+DA:132,288
+DA:133,288
+DA:134,288
+DA:135,288
+DA:136,96
+DA:137,96
+DA:138,96
+DA:139,96
+DA:140,96
+DA:141,96
+DA:142,96
+DA:145,576
+DA:148,384
+DA:149,576
+DA:153,384
+DA:154,384
+DA:155,576
+DA:159,192
+DA:160,192
+DA:161,192
+DA:162,0
+DA:163,0
+DA:164,0
+DA:166,192
+DA:167,192
+DA:169,384
+DA:170,384
+DA:173,384
+DA:177,192
+DA:178,192
+DA:179,192
+DA:183,0
+DA:184,0
+DA:185,0
+DA:186,0
+DA:187,0
+DA:188,0
+DA:189,0
+DA:191,0
+DA:192,0
+DA:194,192
+DA:195,192
+DA:198,384
+DA:199,384
+DA:205,192
+DA:215,192
+DA:216,192
+DA:218,192
+DA:219,192
+DA:220,192
+DA:221,192
+DA:222,192
+DA:223,192
+DA:224,192
+DA:225,192
+DA:226,192
+DA:227,192
+DA:230,192
+DA:231,192
+DA:233,192
+DA:234,192
+DA:235,192
+DA:236,192
+DA:237,192
+DA:238,192
+DA:239,192
+DA:240,192
+DA:242,192
+DA:243,192
+DA:247,192
+DA:250,192
+DA:254,192
+DA:258,0
+DA:259,0
+DA:260,0
+DA:262,0
+DA:263,0
+DA:265,0
+DA:269,192
+DA:270,192
+DA:271,192
+DA:277,192
+DA:280,576
+DA:283,384
+DA:284,384
+DA:290,288
+DA:291,288
+DA:293,96
+DA:294,96
+DA:295,96
+DA:297,96
+DA:299,96
+DA:302,192
+DA:303,0
+DA:304,0
+DA:305,0
+DA:308,288
+DA:311,192
+DA:317,1536
+DA:320,4608
+DA:329,3072
+DA:330,3072
+DA:331,3072
+DA:332,3072
+DA:335,3072
+DA:336,3072
+DA:337,3072
+DA:338,3072
+DA:339,3072
+DA:341,3072
+DA:342,1536
+DA:345,3072
+DA:346,768
+DA:347,768
+DA:348,768
+DA:350,768
+DA:351,1536
+DA:355,1536
+DA:356,1536
+DA:357,0
+DA:358,0
+DA:360,1536
+DA:361,1536
+DA:362,1536
+DA:363,768
+DA:364,768
+DA:367,1536
+DA:373,1536
+DA:376,3072
+DA:383,1536
+DA:385,1536
+DA:386,768
+DA:387,768
+DA:388,768
+DA:392,768
+DA:393,768
+DA:394,0
+DA:396,768
+DA:398,768
+DA:399,768
+DA:407,1536
+DA:409,1536
+DA:410,1536
+DA:411,1536
+DA:413,1536
+DA:415,1536
+DA:417,1536
+DA:423,192
+DA:425,576
+DA:429,384
+DA:430,384
+DA:431,384
+DA:432,96
+DA:434,384
+DA:436,192
+DA:442,192
+DA:447,192
+DA:454,192
+DA:456,192
+DA:457,192
+DA:458,192
+DA:459,192
+DA:460,192
+DA:461,192
+DA:462,192
+DA:465,192
+DA:466,0
+DA:467,0
+DA:468,0
+DA:470,0
+DA:473,192
+DA:474,192
+DA:475,192
+DA:476,192
+DA:479,192
+DA:482,384
+DA:484,192
+DA:485,192
+DA:487,96
+DA:488,96
+DA:489,96
+DA:494,1728
+DA:496,1536
+DA:500,1536
+DA:501,1536
+DA:505,1536
+DA:510,192
+DA:511,192
+DA:512,192
+DA:518,0
+DA:523,0
+DA:529,0
+DA:530,0
+DA:531,0
+DA:532,0
+DA:533,0
+DA:535,0
+DA:536,0
+DA:537,0
+DA:538,0
+DA:539,0
+DA:540,0
+DA:541,0
+DA:542,0
+DA:543,0
+DA:546,0
+DA:549,0
+DA:552,0
+DA:555,0
+DA:556,0
+DA:561,0
+DA:562,0
+DA:563,0
+DA:564,0
+DA:565,0
+DA:567,0
+DA:569,0
+DA:570,0
+DA:574,0
+DA:575,0
+DA:580,0
+DA:581,0
+DA:582,0
+DA:583,0
+DA:585,0
+DA:587,0
+DA:588,0
+DA:592,0
+DA:593,0
+DA:595,0
+DA:596,0
+DA:597,0
+DA:601,0
+DA:602,0
+DA:604,0
+DA:606,0
+DA:610,0
+DA:611,0
+DA:612,0
+DA:615,0
+DA:618,0
+DA:622,0
+DA:623,0
+DA:624,0
+DA:627,0
+DA:631,0
+DA:637,0
+DA:638,0
+DA:639,0
+DA:640,0
+DA:641,0
+DA:642,0
+DA:647,0
+DA:648,0
+DA:649,0
+DA:652,0
+DA:654,0
+DA:658,0
+DA:659,0
+DA:662,0
+DA:663,0
+DA:664,0
+DA:666,0
+DA:669,0
+DA:675,0
+DA:676,0
+DA:677,0
+DA:678,0
+DA:679,0
+DA:685,0
+DA:686,0
+DA:687,0
+DA:693,0
+DA:694,0
+DA:698,0
+DA:699,0
+DA:700,0
+DA:702,0
+DA:707,0
+DA:708,0
+DA:712,0
+DA:716,0
+DA:717,0
+DA:718,0
+DA:722,0
+DA:723,0
+DA:724,0
+DA:725,0
+DA:731,0
+DA:732,0
+DA:738,0
+DA:739,0
+DA:745,192
+DA:748,192
+DA:749,768
+DA:750,576
+DA:751,576
+DA:753,192
+DA:754,192
+DA:755,192
+DA:756,192
+DA:757,192
+DA:758,192
+DA:760,576
+DA:761,384
+DA:762,384
+DA:764,192
+DA:765,192
+DA:767,384
+DA:768,192
+DA:769,192
+DA:771,192
+DA:772,192
+DA:775,192
+DA:776,192
+DA:778,192
+DA:779,192
+DA:785,192
+DA:790,192
+DA:791,192
+DA:792,192
+DA:794,192
+DA:795,192
+DA:797,192
+DA:799,192
+DA:800,192
+DA:801,192
+DA:802,192
+DA:803,192
+DA:804,192
+LF:400
+LH:249
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-serial.c
+FNL:0,18,24
+FNA:0,48,CeedDestroy_Opt
+FNL:1,29,51
+FNA:1,48,CeedInit_Opt_Serial
+FNL:2,56,56
+FNA:2,192,CeedRegister_Opt_Serial
+FNF:3
+FNH:3
+DA:18,48
+DA:21,48
+DA:22,48
+DA:23,48
+DA:29,48
+DA:33,48
+DA:35,48
+DA:38,48
+DA:39,48
+DA:40,48
+DA:42,48
+DA:43,48
+DA:44,48
+DA:47,48
+DA:48,48
+DA:49,48
+DA:50,48
+DA:56,192
+LF:18
+LH:18
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/opt/ceed-opt-tensor.c
+FNL:0,16,35
+FNA:0,2016,CeedTensorContractApply_Core_Opt
+FNL:1,40,49
+FNA:1,2016,CeedTensorContractApply_Opt
+FNL:2,54,57
+FNA:2,96,CeedTensorContractCreate_Opt
+FNF:3
+FNH:3
+DA:16,2016
+DA:19,2016
+DA:21,2016
+DA:22,504
+DA:23,504
+DA:26,13080
+DA:27,72600
+DA:28,419808
+DA:29,358272
+DA:30,2727456
+DA:34,2016
+DA:40,2016
+DA:42,2016
+DA:43,415104
+DA:46,2016
+DA:47,672
+DA:54,96
+DA:55,96
+DA:56,96
+LF:19
+LH:19
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-basis.c
+FNL:0,19,251
+FNA:0,4800,CeedBasisApplyCore_Ref
+FNL:1,253,256
+FNA:1,4800,CeedBasisApply_Ref
+FNL:2,258,261
+FNA:2,0,CeedBasisApplyAdd_Ref
+FNL:3,266,273
+FNA:3,384,CeedBasisDestroyTensor_Ref
+FNL:4,278,306
+FNA:4,384,CeedBasisCreateTensorH1_Ref
+FNL:5,311,328
+FNA:5,0,CeedBasisCreateH1_Ref
+FNL:6,333,350
+FNA:6,0,CeedBasisCreateHdiv_Ref
+FNL:7,355,372
+FNA:7,0,CeedBasisCreateHcurl_Ref
+FNF:8
+FNH:4
+DA:19,4800
+DA:21,4800
+DA:28,4800
+DA:29,4800
+DA:30,4800
+DA:31,4800
+DA:32,4800
+DA:33,4800
+DA:34,4800
+DA:35,4800
+DA:36,192
+DA:38,4800
+DA:39,4800
+DA:41,4800
+DA:44,1536
+DA:45,64576
+DA:48,4800
+DA:49,4800
+DA:53,4800
+DA:54,4800
+DA:55,4800
+DA:57,3072
+DA:58,3072
+DA:59,0
+DA:60,3072
+DA:61,3072
+DA:63,3072
+DA:64,1536
+DA:65,1536
+DA:67,3072
+DA:68,3072
+DA:71,3072
+DA:72,7104
+DA:73,4032
+DA:75,4032
+DA:76,4032
+DA:79,3072
+DA:81,1536
+DA:86,1536
+DA:88,1536
+DA:89,0
+DA:90,0
+DA:92,1536
+DA:95,1536
+DA:96,1536
+DA:97,1536
+DA:98,1536
+DA:102,3552
+DA:103,2016
+DA:107,2016
+DA:108,2016
+DA:112,1536
+DA:113,1536
+DA:114,0
+DA:115,0
+DA:117,1536
+DA:118,3552
+DA:119,2016
+DA:124,2016
+DA:125,2016
+DA:127,0
+DA:130,0
+DA:133,0
+DA:135,0
+DA:136,0
+DA:139,0
+DA:140,0
+DA:142,0
+DA:145,0
+DA:147,0
+DA:148,0
+DA:149,0
+DA:151,0
+DA:154,0
+DA:155,0
+DA:157,0
+DA:158,0
+DA:162,0
+DA:163,0
+DA:167,1536
+DA:169,192
+DA:170,192
+DA:173,192
+DA:174,192
+DA:175,576
+DA:176,384
+DA:178,3648
+DA:179,22848
+DA:180,66048
+DA:181,46464
+DA:183,255552
+DA:188,192
+DA:199,0
+DA:201,0
+DA:203,0
+DA:206,0
+DA:207,0
+DA:208,0
+DA:210,0
+DA:213,0
+DA:214,0
+DA:215,0
+DA:217,0
+DA:220,0
+DA:221,0
+DA:222,0
+DA:224,0
+DA:227,0
+DA:228,0
+DA:229,0
+DA:231,0
+DA:234,0
+DA:235,0
+DA:236,0
+DA:237,0
+DA:239,0
+DA:246,4800
+DA:247,4608
+DA:249,4800
+DA:250,4800
+DA:253,4800
+DA:254,4800
+DA:255,4800
+DA:258,0
+DA:259,0
+DA:260,0
+DA:266,384
+DA:269,384
+DA:270,384
+DA:271,384
+DA:272,384
+DA:278,384
+DA:284,384
+DA:285,384
+DA:287,384
+DA:289,384
+DA:290,384
+DA:291,384
+DA:292,384
+DA:294,384
+DA:296,384
+DA:297,384
+DA:298,384
+DA:300,384
+DA:301,384
+DA:302,384
+DA:303,384
+DA:304,384
+DA:305,384
+DA:311,0
+DA:316,0
+DA:317,0
+DA:319,0
+DA:320,0
+DA:321,0
+DA:323,0
+DA:324,0
+DA:325,0
+DA:326,0
+DA:327,0
+DA:333,0
+DA:338,0
+DA:339,0
+DA:341,0
+DA:342,0
+DA:343,0
+DA:345,0
+DA:346,0
+DA:347,0
+DA:348,0
+DA:349,0
+DA:355,0
+DA:360,0
+DA:361,0
+DA:363,0
+DA:364,0
+DA:365,0
+DA:367,0
+DA:368,0
+DA:369,0
+DA:370,0
+DA:371,0
+LF:182
+LH:98
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-operator.c
+FNL:0,1025,1084
+FNA:0,0,CeedOperatorApplyAddAtPoints_Ref
+FNL:1,1089,1310
+FNA:1,0,CeedOperatorLinearAssembleQFunctionAtPointsCore_Ref
+FNL:2,1315,1317
+FNA:2,0,CeedOperatorLinearAssembleQFunctionAtPoints_Ref
+FNL:3,1322,1325
+FNA:3,0,CeedOperatorLinearAssembleQFunctionAtPointsUpdate_Ref
+FNL:4,1330,1528
+FNA:4,0,CeedOperatorLinearAssembleAddDiagonalAtPoints_Ref
+FNL:5,145,206
+FNA:5,96,CeedOperatorSetup_Ref
+FNL:6,1533,1735
+FNA:6,0,CeedOperatorAssembleSingleAtPoints_Ref
+FNL:7,1740,1771
+FNA:7,96,CeedOperatorDestroy_Ref
+FNL:8,1776,1789
+FNA:8,96,CeedOperatorCreate_Ref
+FNL:9,1794,1810
+FNA:9,0,CeedOperatorCreateAtPoints_Ref
+FNL:10,19,140
+FNA:10,192,CeedOperatorSetupFields_Ref
+FNL:11,211,249
+FNA:11,96,CeedOperatorSetupInputs_Ref
+FNL:12,254,299
+FNA:12,1344,CeedOperatorInputBasis_Ref
+FNL:13,304,345
+FNA:13,1344,CeedOperatorOutputBasis_Ref
+FNL:14,350,373
+FNA:14,96,CeedOperatorRestoreInputs_Ref
+FNL:15,378,473
+FNA:15,96,CeedOperatorApplyAdd_Ref
+FNL:16,478,654
+FNA:16,0,CeedOperatorLinearAssembleQFunctionCore_Ref
+FNL:17,659,661
+FNA:17,0,CeedOperatorLinearAssembleQFunction_Ref
+FNL:18,666,668
+FNA:18,0,CeedOperatorLinearAssembleQFunctionUpdate_Ref
+FNL:19,673,827
+FNA:19,0,CeedOperatorSetupFieldsAtPoints_Ref
+FNL:20,832,882
+FNA:20,0,CeedOperatorSetupAtPoints_Ref
+FNL:21,887,949
+FNA:21,0,CeedOperatorInputBasisAtPoints_Ref
+FNL:22,954,1020
+FNA:22,0,CeedOperatorOutputBasisAtPoints_Ref
+FNF:23
+FNH:9
+DA:19,192
+DA:31,192
+DA:32,192
+DA:33,192
+DA:34,192
+DA:36,192
+DA:37,96
+DA:38,96
+DA:40,96
+DA:41,96
+DA:45,480
+DA:50,288
+DA:51,288
+DA:52,240
+DA:53,240
+DA:54,240
+DA:57,288
+DA:58,96
+DA:59,96
+DA:60,96
+DA:61,96
+DA:62,96
+DA:63,144
+DA:67,144
+DA:68,144
+DA:69,144
+DA:70,144
+DA:71,144
+DA:72,144
+DA:73,144
+DA:74,144
+DA:75,144
+DA:76,144
+DA:77,48
+DA:78,48
+DA:79,48
+DA:80,48
+DA:81,48
+DA:82,48
+DA:83,48
+DA:87,192
+DA:88,288
+DA:92,192
+DA:93,192
+DA:94,288
+DA:98,96
+DA:99,96
+DA:100,96
+DA:101,0
+DA:102,0
+DA:103,0
+DA:105,96
+DA:106,96
+DA:108,192
+DA:109,192
+DA:112,192
+DA:116,96
+DA:117,96
+DA:118,96
+DA:122,0
+DA:123,0
+DA:124,0
+DA:125,0
+DA:126,0
+DA:127,0
+DA:128,0
+DA:129,0
+DA:131,0
+DA:132,0
+DA:134,96
+DA:135,96
+DA:138,192
+DA:139,192
+DA:145,96
+DA:153,96
+DA:154,96
+DA:156,96
+DA:157,96
+DA:158,96
+DA:159,96
+DA:160,96
+DA:161,96
+DA:164,96
+DA:166,96
+DA:167,96
+DA:168,96
+DA:169,96
+DA:170,96
+DA:171,96
+DA:172,96
+DA:173,96
+DA:174,96
+DA:176,96
+DA:177,96
+DA:181,96
+DA:184,96
+DA:188,96
+DA:192,0
+DA:193,0
+DA:194,0
+DA:196,0
+DA:197,0
+DA:199,0
+DA:203,96
+DA:204,96
+DA:205,96
+DA:211,96
+DA:214,288
+DA:221,192
+DA:222,192
+DA:223,192
+DA:224,96
+DA:225,96
+DA:228,192
+DA:230,192
+DA:233,144
+DA:235,144
+DA:238,144
+DA:239,144
+DA:240,144
+DA:242,144
+DA:244,144
+DA:246,192
+DA:248,96
+DA:254,1344
+DA:257,4032
+DA:264,2688
+DA:268,0
+DA:269,0
+DA:270,0
+DA:271,0
+DA:274,2688
+DA:275,2688
+DA:276,2688
+DA:277,2688
+DA:278,2688
+DA:280,2688
+DA:281,672
+DA:282,672
+DA:283,672
+DA:284,1344
+DA:288,1344
+DA:289,1344
+DA:290,1344
+DA:291,1344
+DA:292,1344
+DA:293,1344
+DA:294,672
+DA:295,672
+DA:298,1344
+DA:304,1344
+DA:307,2688
+DA:314,1344
+DA:315,1344
+DA:316,1344
+DA:317,1344
+DA:319,1344
+DA:320,672
+DA:321,672
+DA:322,672
+DA:326,672
+DA:327,672
+DA:328,672
+DA:330,672
+DA:331,0
+DA:333,672
+DA:335,672
+DA:336,672
+DA:344,1344
+DA:350,96
+DA:352,288
+DA:356,192
+DA:360,0
+DA:361,0
+DA:362,0
+DA:363,0
+DA:366,192
+DA:367,192
+DA:369,144
+DA:372,96
+DA:378,96
+DA:381,96
+DA:388,96
+DA:390,96
+DA:391,96
+DA:394,96
+DA:397,0
+DA:398,0
+DA:399,0
+DA:400,0
+DA:401,0
+DA:402,0
+DA:403,0
+DA:406,96
+DA:407,96
+DA:408,96
+DA:409,96
+DA:412,96
+DA:415,192
+DA:416,96
+DA:417,0
+DA:419,96
+DA:424,1440
+DA:426,2688
+DA:427,1344
+DA:428,1344
+DA:429,672
+DA:430,672
+DA:436,1344
+DA:439,1344
+DA:440,1344
+DA:444,1344
+DA:449,192
+DA:454,96
+DA:456,96
+DA:458,96
+DA:460,96
+DA:461,96
+DA:463,96
+DA:464,96
+DA:465,96
+DA:466,96
+DA:470,96
+DA:471,96
+DA:472,96
+DA:478,0
+DA:482,0
+DA:488,0
+DA:489,0
+DA:490,0
+DA:491,0
+DA:492,0
+DA:493,0
+DA:494,0
+DA:495,0
+DA:496,0
+DA:499,0
+DA:502,0
+DA:505,0
+DA:508,0
+DA:509,0
+DA:514,0
+DA:516,0
+DA:517,0
+DA:518,0
+DA:519,0
+DA:521,0
+DA:523,0
+DA:524,0
+DA:528,0
+DA:529,0
+DA:534,0
+DA:536,0
+DA:537,0
+DA:538,0
+DA:540,0
+DA:542,0
+DA:543,0
+DA:547,0
+DA:548,0
+DA:549,0
+DA:552,0
+DA:555,0
+DA:558,0
+DA:559,0
+DA:562,0
+DA:564,0
+DA:568,0
+DA:574,0
+DA:575,0
+DA:576,0
+DA:577,0
+DA:578,0
+DA:579,0
+DA:584,0
+DA:585,0
+DA:586,0
+DA:589,0
+DA:591,0
+DA:595,0
+DA:597,0
+DA:600,0
+DA:601,0
+DA:602,0
+DA:604,0
+DA:607,0
+DA:613,0
+DA:614,0
+DA:615,0
+DA:616,0
+DA:617,0
+DA:623,0
+DA:624,0
+DA:625,0
+DA:632,0
+DA:633,0
+DA:637,0
+DA:639,0
+DA:640,0
+DA:642,0
+DA:647,0
+DA:650,0
+DA:651,0
+DA:652,0
+DA:653,0
+DA:659,0
+DA:660,0
+DA:666,0
+DA:667,0
+DA:673,0
+DA:685,0
+DA:686,0
+DA:687,0
+DA:688,0
+DA:690,0
+DA:691,0
+DA:692,0
+DA:694,0
+DA:695,0
+DA:701,0
+DA:704,0
+DA:705,0
+DA:706,0
+DA:707,0
+DA:708,0
+DA:709,0
+DA:710,0
+DA:711,0
+DA:716,0
+DA:720,0
+DA:721,0
+DA:724,0
+DA:725,0
+DA:726,0
+DA:727,0
+DA:730,0
+DA:731,0
+DA:734,0
+DA:735,0
+DA:736,0
+DA:737,0
+DA:738,0
+DA:739,0
+DA:741,0
+DA:742,0
+DA:744,0
+DA:745,0
+DA:747,0
+DA:751,0
+DA:752,0
+DA:753,0
+DA:754,0
+DA:755,0
+DA:756,0
+DA:757,0
+DA:758,0
+DA:759,0
+DA:760,0
+DA:761,0
+DA:762,0
+DA:763,0
+DA:764,0
+DA:765,0
+DA:767,0
+DA:768,0
+DA:771,0
+DA:772,0
+DA:775,0
+DA:776,0
+DA:780,0
+DA:781,0
+DA:782,0
+DA:786,0
+DA:787,0
+DA:788,0
+DA:789,0
+DA:790,0
+DA:791,0
+DA:793,0
+DA:794,0
+DA:796,0
+DA:797,0
+DA:800,0
+DA:804,0
+DA:805,0
+DA:806,0
+DA:810,0
+DA:811,0
+DA:812,0
+DA:813,0
+DA:814,0
+DA:815,0
+DA:816,0
+DA:818,0
+DA:819,0
+DA:821,0
+DA:822,0
+DA:825,0
+DA:826,0
+DA:832,0
+DA:840,0
+DA:841,0
+DA:843,0
+DA:844,0
+DA:845,0
+DA:846,0
+DA:847,0
+DA:848,0
+DA:851,0
+DA:853,0
+DA:854,0
+DA:855,0
+DA:856,0
+DA:857,0
+DA:858,0
+DA:859,0
+DA:860,0
+DA:862,0
+DA:863,0
+DA:867,0
+DA:870,0
+DA:874,0
+DA:875,0
+DA:876,0
+DA:879,0
+DA:880,0
+DA:881,0
+DA:887,0
+DA:891,0
+DA:901,0
+DA:902,0
+DA:903,0
+DA:904,0
+DA:905,0
+DA:908,0
+DA:909,0
+DA:910,0
+DA:911,0
+DA:914,0
+DA:915,0
+DA:916,0
+DA:918,0
+DA:922,0
+DA:923,0
+DA:924,0
+DA:925,0
+DA:927,0
+DA:929,0
+DA:933,0
+DA:934,0
+DA:935,0
+DA:936,0
+DA:937,0
+DA:939,0
+DA:941,0
+DA:942,0
+DA:943,0
+DA:944,0
+DA:946,0
+DA:948,0
+DA:954,0
+DA:958,0
+DA:967,0
+DA:968,0
+DA:969,0
+DA:970,0
+DA:973,0
+DA:974,0
+DA:976,0
+DA:977,0
+DA:978,0
+DA:979,0
+DA:983,0
+DA:984,0
+DA:985,0
+DA:988,0
+DA:991,0
+DA:992,0
+DA:1001,0
+DA:1002,0
+DA:1003,0
+DA:1007,0
+DA:1008,0
+DA:1009,0
+DA:1011,0
+DA:1012,0
+DA:1014,0
+DA:1016,0
+DA:1017,0
+DA:1019,0
+DA:1025,0
+DA:1026,0
+DA:1027,0
+DA:1028,0
+DA:1029,0
+DA:1035,0
+DA:1036,0
+DA:1037,0
+DA:1038,0
+DA:1039,0
+DA:1042,0
+DA:1045,0
+DA:1048,0
+DA:1051,0
+DA:1055,0
+DA:1056,0
+DA:1057,0
+DA:1060,0
+DA:1064,0
+DA:1065,0
+DA:1069,0
+DA:1073,0
+DA:1077,0
+DA:1080,0
+DA:1081,0
+DA:1082,0
+DA:1083,0
+DA:1089,0
+DA:1092,0
+DA:1093,0
+DA:1094,0
+DA:1099,0
+DA:1101,0
+DA:1102,0
+DA:1103,0
+DA:1104,0
+DA:1105,0
+DA:1106,0
+DA:1107,0
+DA:1108,0
+DA:1111,0
+DA:1114,0
+DA:1117,0
+DA:1118,0
+DA:1121,0
+DA:1124,0
+DA:1125,0
+DA:1130,0
+DA:1132,0
+DA:1136,0
+DA:1138,0
+DA:1139,0
+DA:1140,0
+DA:1141,0
+DA:1144,0
+DA:1145,0
+DA:1147,0
+DA:1149,0
+DA:1150,0
+DA:1154,0
+DA:1155,0
+DA:1160,0
+DA:1162,0
+DA:1166,0
+DA:1168,0
+DA:1169,0
+DA:1170,0
+DA:1171,0
+DA:1174,0
+DA:1175,0
+DA:1176,0
+DA:1178,0
+DA:1180,0
+DA:1181,0
+DA:1185,0
+DA:1189,0
+DA:1192,0
+DA:1193,0
+DA:1195,0
+DA:1198,0
+DA:1201,0
+DA:1202,0
+DA:1205,0
+DA:1209,0
+DA:1210,0
+DA:1211,0
+DA:1214,0
+DA:1218,0
+DA:1224,0
+DA:1225,0
+DA:1226,0
+DA:1228,0
+DA:1230,0
+DA:1231,0
+DA:1236,0
+DA:1237,0
+DA:1238,0
+DA:1241,0
+DA:1243,0
+DA:1248,0
+DA:1250,0
+DA:1251,0
+DA:1252,0
+DA:1253,0
+DA:1255,0
+DA:1258,0
+DA:1264,0
+DA:1265,0
+DA:1266,0
+DA:1267,0
+DA:1268,0
+DA:1274,0
+DA:1275,0
+DA:1276,0
+DA:1280,0
+DA:1284,0
+DA:1285,0
+DA:1289,0
+DA:1291,0
+DA:1292,0
+DA:1294,0
+DA:1299,0
+DA:1302,0
+DA:1305,0
+DA:1306,0
+DA:1307,0
+DA:1308,0
+DA:1309,0
+DA:1315,0
+DA:1316,0
+DA:1322,0
+DA:1324,0
+DA:1330,0
+DA:1331,0
+DA:1332,0
+DA:1334,0
+DA:1335,0
+DA:1341,0
+DA:1342,0
+DA:1343,0
+DA:1344,0
+DA:1345,0
+DA:1348,0
+DA:1354,0
+DA:1355,0
+DA:1356,0
+DA:1357,0
+DA:1361,0
+DA:1367,0
+DA:1368,0
+DA:1369,0
+DA:1370,0
+DA:1374,0
+DA:1378,0
+DA:1379,0
+DA:1380,0
+DA:1381,0
+DA:1382,0
+DA:1386,0
+DA:1389,0
+DA:1390,0
+DA:1393,0
+DA:1394,0
+DA:1395,0
+DA:1398,0
+DA:1402,0
+DA:1403,0
+DA:1404,0
+DA:1410,0
+DA:1411,0
+DA:1412,0
+DA:1413,0
+DA:1416,0
+DA:1417,0
+DA:1418,0
+DA:1419,0
+DA:1420,0
+DA:1421,0
+DA:1422,0
+DA:1424,0
+DA:1425,0
+DA:1430,0
+DA:1431,0
+DA:1432,0
+DA:1433,0
+DA:1436,0
+DA:1440,0
+DA:1441,0
+DA:1445,0
+DA:1450,0
+DA:1452,0
+DA:1458,0
+DA:1459,0
+DA:1460,0
+DA:1461,0
+DA:1464,0
+DA:1465,0
+DA:1466,0
+DA:1467,0
+DA:1468,0
+DA:1470,0
+DA:1471,0
+DA:1473,0
+DA:1476,0
+DA:1478,0
+DA:1479,0
+DA:1480,0
+DA:1481,0
+DA:1486,0
+DA:1488,0
+DA:1489,0
+DA:1490,0
+DA:1491,0
+DA:1492,0
+DA:1493,0
+DA:1494,0
+DA:1497,0
+DA:1498,0
+DA:1500,0
+DA:1502,0
+DA:1505,0
+DA:1508,0
+DA:1509,0
+DA:1510,0
+DA:1514,0
+DA:1518,0
+DA:1521,0
+DA:1522,0
+DA:1523,0
+DA:1524,0
+DA:1525,0
+DA:1526,0
+DA:1527,0
+DA:1533,0
+DA:1534,0
+DA:1535,0
+DA:1537,0
+DA:1538,0
+DA:1544,0
+DA:1545,0
+DA:1546,0
+DA:1547,0
+DA:1548,0
+DA:1551,0
+DA:1557,0
+DA:1558,0
+DA:1559,0
+DA:1560,0
+DA:1564,0
+DA:1570,0
+DA:1571,0
+DA:1572,0
+DA:1573,0
+DA:1577,0
+DA:1580,0
+DA:1584,0
+DA:1585,0
+DA:1586,0
+DA:1587,0
+DA:1588,0
+DA:1592,0
+DA:1595,0
+DA:1596,0
+DA:1599,0
+DA:1601,0
+DA:1602,0
+DA:1605,0
+DA:1609,0
+DA:1610,0
+DA:1611,0
+DA:1617,0
+DA:1618,0
+DA:1619,0
+DA:1620,0
+DA:1623,0
+DA:1624,0
+DA:1625,0
+DA:1626,0
+DA:1627,0
+DA:1628,0
+DA:1629,0
+DA:1631,0
+DA:1632,0
+DA:1633,0
+DA:1634,0
+DA:1640,0
+DA:1641,0
+DA:1642,0
+DA:1643,0
+DA:1644,0
+DA:1647,0
+DA:1651,0
+DA:1652,0
+DA:1656,0
+DA:1661,0
+DA:1663,0
+DA:1669,0
+DA:1670,0
+DA:1671,0
+DA:1672,0
+DA:1675,0
+DA:1676,0
+DA:1677,0
+DA:1678,0
+DA:1679,0
+DA:1681,0
+DA:1682,0
+DA:1684,0
+DA:1687,0
+DA:1689,0
+DA:1690,0
+DA:1691,0
+DA:1697,0
+DA:1698,0
+DA:1699,0
+DA:1700,0
+DA:1702,0
+DA:1703,0
+DA:1705,0
+DA:1709,0
+DA:1712,0
+DA:1713,0
+DA:1714,0
+DA:1718,0
+DA:1722,0
+DA:1725,0
+DA:1728,0
+DA:1729,0
+DA:1730,0
+DA:1731,0
+DA:1732,0
+DA:1733,0
+DA:1734,0
+DA:1740,96
+DA:1743,96
+DA:1744,96
+DA:1745,96
+DA:1746,96
+DA:1747,96
+DA:1748,384
+DA:1749,288
+DA:1751,96
+DA:1752,96
+DA:1754,288
+DA:1755,192
+DA:1756,192
+DA:1758,96
+DA:1759,96
+DA:1761,192
+DA:1762,96
+DA:1763,96
+DA:1765,96
+DA:1766,96
+DA:1767,96
+DA:1769,96
+DA:1770,96
+DA:1776,96
+DA:1780,96
+DA:1781,96
+DA:1782,96
+DA:1783,96
+DA:1784,96
+DA:1785,96
+DA:1786,96
+DA:1787,96
+DA:1788,96
+DA:1794,0
+DA:1798,0
+DA:1799,0
+DA:1800,0
+DA:1801,0
+DA:1802,0
+DA:1804,0
+DA:1805,0
+DA:1806,0
+DA:1807,0
+DA:1808,0
+DA:1809,0
+LF:868
+LH:222
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-qfunction.c
+FNL:0,17,45
+FNA:0,2304,CeedQFunctionApply_Ref
+FNL:1,50,58
+FNA:1,288,CeedQFunctionDestroy_Ref
+FNL:2,63,76
+FNA:2,288,CeedQFunctionCreate_Ref
+FNF:3
+FNH:3
+DA:17,2304
+DA:18,2304
+DA:20,2304
+DA:23,2304
+DA:24,2304
+DA:25,2304
+DA:26,2304
+DA:28,6912
+DA:29,4608
+DA:31,4608
+DA:32,2304
+DA:35,2304
+DA:37,6912
+DA:38,4608
+DA:40,4608
+DA:41,2304
+DA:43,2304
+DA:44,2304
+DA:50,288
+DA:53,288
+DA:54,288
+DA:55,288
+DA:56,288
+DA:57,288
+DA:63,288
+DA:67,288
+DA:68,288
+DA:69,288
+DA:70,288
+DA:71,288
+DA:72,288
+DA:73,288
+DA:74,288
+DA:75,288
+LF:34
+LH:34
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-qfunctioncontext.c
+FNL:0,103,103
+FNA:0,1188,CeedQFunctionContextRestoreData_Ref
+FNL:1,108,115
+FNA:1,216,CeedQFunctionContextDestroy_Ref
+FNL:2,120,138
+FNA:2,216,CeedQFunctionContextCreate_Ref
+FNL:3,18,24
+FNA:3,1188,CeedQFunctionContextHasValidData_Ref
+FNL:4,29,36
+FNA:4,0,CeedQFunctionContextHasBorrowedDataOfType_Ref
+FNL:5,41,68
+FNA:5,216,CeedQFunctionContextSetData_Ref
+FNL:6,73,84
+FNA:6,0,CeedQFunctionContextTakeData_Ref
+FNL:7,89,98
+FNA:7,1188,CeedQFunctionContextGetData_Ref
+FNF:8
+FNH:6
+DA:18,1188
+DA:21,1188
+DA:22,1188
+DA:23,1188
+DA:29,0
+DA:32,0
+DA:33,0
+DA:34,0
+DA:35,0
+DA:41,216
+DA:45,216
+DA:46,216
+DA:48,216
+DA:50,216
+DA:51,216
+DA:52,0
+DA:53,0
+DA:54,0
+DA:55,0
+DA:56,0
+DA:57,0
+DA:58,72
+DA:59,72
+DA:60,72
+DA:61,72
+DA:62,72
+DA:63,144
+DA:64,144
+DA:65,144
+DA:67,216
+DA:73,0
+DA:76,0
+DA:78,0
+DA:80,0
+DA:81,0
+DA:82,0
+DA:83,0
+DA:89,1188
+DA:92,1188
+DA:94,1188
+DA:96,1188
+DA:97,1188
+DA:103,1188
+DA:108,216
+DA:111,216
+DA:112,216
+DA:113,216
+DA:114,216
+DA:120,216
+DA:124,216
+DA:125,216
+DA:126,216
+DA:127,216
+DA:128,216
+DA:129,216
+DA:130,216
+DA:131,216
+DA:132,216
+DA:133,216
+DA:134,216
+DA:135,216
+DA:136,216
+DA:137,216
+LF:63
+LH:45
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-restriction.c
+FNL:0,138,179
+FNA:0,0,CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core
+FNL:1,181,218
+FNA:1,816,CeedElemRestrictionApplyStridedTranspose_Ref_Core
+FNL:2,19,57
+FNA:2,144,CeedElemRestrictionApplyStridedNoTranspose_Ref_Core
+FNL:3,220,242
+FNA:3,816,CeedElemRestrictionApplyOffsetTranspose_Ref_Core
+FNL:4,244,266
+FNA:4,0,CeedElemRestrictionApplyOrientedTranspose_Ref_Core
+FNL:5,268,317
+FNA:5,0,CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core
+FNL:6,319,369
+FNA:6,0,CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core
+FNL:7,371,394
+FNA:7,0,CeedElemRestrictionApplyAtPointsInElement_Ref_Core
+FNL:8,396,503
+FNA:8,3408,CeedElemRestrictionApply_Ref_Core
+FNL:9,508,512
+FNA:9,2680,CeedElemRestrictionApply_Ref_110
+FNL:10,514,518
+FNA:10,0,CeedElemRestrictionApply_Ref_111
+FNL:11,520,524
+FNA:11,504,CeedElemRestrictionApply_Ref_180
+FNL:12,526,530
+FNA:12,0,CeedElemRestrictionApply_Ref_181
+FNL:13,532,536
+FNA:13,40,CeedElemRestrictionApply_Ref_310
+FNL:14,538,542
+FNA:14,0,CeedElemRestrictionApply_Ref_311
+FNL:15,544,548
+FNA:15,24,CeedElemRestrictionApply_Ref_380
+FNL:16,550,554
+FNA:16,0,CeedElemRestrictionApply_Ref_381
+FNL:17,588,592
+FNA:17,0,CeedElemRestrictionApply_Ref_511
+FNL:18,59,75
+FNA:18,1632,CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core
+FNL:19,602,606
+FNA:19,0,CeedElemRestrictionApply_Ref_581
+FNL:20,611,622
+FNA:20,336,CeedElemRestrictionApply_Ref
+FNL:21,627,639
+FNA:21,0,CeedElemRestrictionApplyUnsigned_Ref
+FNL:22,644,656
+FNA:22,0,CeedElemRestrictionApplyUnoriented_Ref
+FNL:23,661,669
+FNA:23,0,CeedElemRestrictionApplyAtPointsInElement_Ref
+FNL:24,674,685
+FNA:24,3072,CeedElemRestrictionApplyBlock_Ref
+FNL:25,690,699
+FNA:25,360,CeedElemRestrictionGetOffsets_Ref
+FNL:26,704,713
+FNA:26,0,CeedElemRestrictionGetOrientations_Ref
+FNL:27,718,727
+FNA:27,0,CeedElemRestrictionGetCurlOrientations_Ref
+FNL:28,732,741
+FNA:28,816,CeedElemRestrictionDestroy_Ref
+FNL:29,746,910
+FNA:29,1032,CeedElemRestrictionCreate_Ref
+FNL:30,77,94
+FNA:30,0,CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core
+FNL:31,96,136
+FNA:31,0,CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core
+FNF:32
+FNH:14
+DA:19,144
+DA:26,144
+DA:27,144
+DA:30,1296
+DA:31,2304
+DA:32,29664
+DA:33,103392
+DA:34,74880
+DA:35,74880
+DA:44,0
+DA:45,0
+DA:46,0
+DA:47,0
+DA:48,0
+DA:49,0
+DA:50,0
+DA:56,144
+DA:59,1632
+DA:66,1632
+DA:67,3936
+DA:68,4968
+DA:69,166824
+DA:70,164160
+DA:74,1632
+DA:77,0
+DA:84,0
+DA:85,0
+DA:86,0
+DA:87,0
+DA:88,0
+DA:89,0
+DA:93,0
+DA:96,0
+DA:103,0
+DA:104,0
+DA:105,0
+DA:106,0
+DA:108,0
+DA:109,0
+DA:110,0
+DA:111,0
+DA:112,0
+DA:113,0
+DA:115,0
+DA:116,0
+DA:117,0
+DA:118,0
+DA:119,0
+DA:120,0
+DA:121,0
+DA:122,0
+DA:123,0
+DA:126,0
+DA:127,0
+DA:128,0
+DA:129,0
+DA:130,0
+DA:131,0
+DA:135,0
+DA:138,0
+DA:146,0
+DA:147,0
+DA:148,0
+DA:149,0
+DA:151,0
+DA:152,0
+DA:153,0
+DA:154,0
+DA:155,0
+DA:156,0
+DA:158,0
+DA:159,0
+DA:160,0
+DA:161,0
+DA:162,0
+DA:163,0
+DA:164,0
+DA:165,0
+DA:166,0
+DA:169,0
+DA:170,0
+DA:171,0
+DA:172,0
+DA:173,0
+DA:174,0
+DA:178,0
+DA:181,816
+DA:188,816
+DA:189,816
+DA:192,1968
+DA:193,2304
+DA:194,29664
+DA:195,72288
+DA:196,43776
+DA:205,0
+DA:206,0
+DA:207,0
+DA:208,0
+DA:209,0
+DA:210,0
+DA:211,0
+DA:217,816
+DA:220,816
+DA:227,816
+DA:228,1968
+DA:229,2304
+DA:230,19872
+DA:232,48000
+DA:235,29280
+DA:236,29280
+DA:241,816
+DA:244,0
+DA:251,0
+DA:252,0
+DA:253,0
+DA:254,0
+DA:256,0
+DA:259,0
+DA:260,0
+DA:265,0
+DA:268,0
+DA:271,0
+DA:274,0
+DA:276,0
+DA:277,0
+DA:278,0
+DA:280,0
+DA:281,0
+DA:283,0
+DA:284,0
+DA:285,0
+DA:286,0
+DA:287,0
+DA:289,0
+DA:290,0
+DA:292,0
+DA:293,0
+DA:294,0
+DA:295,0
+DA:296,0
+DA:297,0
+DA:298,0
+DA:299,0
+DA:301,0
+DA:302,0
+DA:305,0
+DA:306,0
+DA:307,0
+DA:308,0
+DA:309,0
+DA:311,0
+DA:312,0
+DA:316,0
+DA:319,0
+DA:323,0
+DA:326,0
+DA:328,0
+DA:329,0
+DA:330,0
+DA:332,0
+DA:333,0
+DA:335,0
+DA:336,0
+DA:337,0
+DA:338,0
+DA:339,0
+DA:341,0
+DA:342,0
+DA:344,0
+DA:345,0
+DA:346,0
+DA:347,0
+DA:348,0
+DA:349,0
+DA:350,0
+DA:351,0
+DA:353,0
+DA:354,0
+DA:357,0
+DA:358,0
+DA:359,0
+DA:360,0
+DA:361,0
+DA:363,0
+DA:364,0
+DA:368,0
+DA:371,0
+DA:375,0
+DA:378,0
+DA:379,0
+DA:380,0
+DA:381,0
+DA:382,0
+DA:383,0
+DA:384,0
+DA:387,0
+DA:388,0
+DA:391,0
+DA:393,0
+DA:396,3408
+DA:400,3408
+DA:405,3408
+DA:406,3408
+DA:407,3408
+DA:408,3408
+DA:409,3408
+DA:411,3408
+DA:413,1632
+DA:416,1776
+DA:418,3408
+DA:424,1632
+DA:425,816
+DA:426,816
+DA:428,816
+DA:429,816
+DA:430,816
+DA:432,816
+DA:433,0
+DA:434,0
+DA:435,0
+DA:438,0
+DA:441,0
+DA:442,0
+DA:443,0
+DA:444,0
+DA:446,0
+DA:447,0
+DA:450,0
+DA:453,0
+DA:454,0
+DA:455,0
+DA:456,0
+DA:464,1776
+DA:465,144
+DA:466,144
+DA:468,144
+DA:469,1632
+DA:470,1632
+DA:472,1632
+DA:473,0
+DA:474,0
+DA:475,0
+DA:478,0
+DA:481,0
+DA:482,0
+DA:483,0
+DA:484,0
+DA:486,0
+DA:487,0
+DA:490,0
+DA:493,0
+DA:494,0
+DA:495,0
+DA:496,0
+DA:499,3408
+DA:500,3408
+DA:501,3408
+DA:502,3408
+DA:508,2680
+DA:511,2680
+DA:514,0
+DA:517,0
+DA:520,504
+DA:523,504
+DA:526,0
+DA:529,0
+DA:532,40
+DA:535,40
+DA:538,0
+DA:541,0
+DA:544,24
+DA:547,24
+DA:550,0
+DA:553,0
+DA:588,0
+DA:591,0
+DA:602,0
+DA:605,0
+DA:611,336
+DA:615,336
+DA:616,336
+DA:617,336
+DA:618,336
+DA:619,336
+DA:620,336
+DA:621,336
+DA:627,0
+DA:632,0
+DA:633,0
+DA:634,0
+DA:635,0
+DA:636,0
+DA:637,0
+DA:638,0
+DA:644,0
+DA:649,0
+DA:650,0
+DA:651,0
+DA:652,0
+DA:653,0
+DA:654,0
+DA:655,0
+DA:661,0
+DA:666,0
+DA:667,0
+DA:668,0
+DA:674,3072
+DA:679,3072
+DA:680,3072
+DA:681,3072
+DA:682,3072
+DA:683,3072
+DA:684,3072
+DA:690,360
+DA:693,360
+DA:695,360
+DA:697,360
+DA:698,360
+DA:704,0
+DA:707,0
+DA:709,0
+DA:711,0
+DA:712,0
+DA:718,0
+DA:721,0
+DA:723,0
+DA:725,0
+DA:726,0
+DA:732,816
+DA:735,816
+DA:736,816
+DA:737,816
+DA:738,816
+DA:739,816
+DA:740,816
+DA:746,1032
+DA:749,1032
+DA:753,1032
+DA:754,1032
+DA:755,1032
+DA:756,1032
+DA:757,1032
+DA:758,1032
+DA:759,1032
+DA:760,1032
+DA:762,1032
+DA:764,1032
+DA:765,1032
+DA:770,1032
+DA:772,1032
+DA:773,1032
+DA:774,384
+DA:775,384
+DA:776,384
+DA:782,1032
+DA:783,0
+DA:785,0
+DA:786,0
+DA:788,0
+DA:789,0
+DA:792,0
+DA:793,0
+DA:797,1032
+DA:802,648
+DA:804,648
+DA:805,648
+DA:806,648
+DA:808,648
+DA:811,168
+DA:812,34328
+DA:813,34160
+DA:819,648
+DA:820,648
+DA:821,648
+DA:824,648
+DA:825,0
+DA:826,0
+DA:827,648
+DA:828,0
+DA:829,0
+DA:835,1032
+DA:837,1032
+DA:838,1032
+DA:839,544
+DA:840,544
+DA:841,544
+DA:842,0
+DA:843,0
+DA:844,0
+DA:845,312
+DA:846,312
+DA:847,312
+DA:848,0
+DA:849,0
+DA:850,0
+DA:851,64
+DA:852,64
+DA:853,64
+DA:854,0
+DA:855,0
+DA:856,0
+DA:857,24
+DA:858,24
+DA:859,24
+DA:860,0
+DA:861,0
+DA:862,0
+DA:880,0
+DA:881,0
+DA:882,0
+DA:888,0
+DA:889,0
+DA:890,0
+DA:891,88
+DA:892,88
+DA:893,88
+DA:897,1032
+DA:898,1032
+DA:899,1032
+DA:900,1032
+DA:901,0
+DA:903,1032
+DA:904,1032
+DA:905,1032
+DA:906,1032
+DA:907,1032
+DA:908,1032
+DA:909,1032
+LF:428
+LH:162
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-tensor.c
+FNL:0,16,38
+FNA:0,4032,CeedTensorContractApply_Ref
+FNL:1,43,43
+FNA:1,192,CeedTensorContractDestroy_Ref
+FNL:2,48,56
+FNA:2,192,CeedTensorContractCreate_Ref
+FNF:3
+FNH:3
+DA:16,4032
+DA:18,4032
+DA:20,4032
+DA:21,1008
+DA:22,1008
+DA:25,4032
+DA:26,830208
+DA:29,26160
+DA:30,145200
+DA:31,839616
+DA:32,716544
+DA:33,5454912
+DA:37,4032
+DA:43,192
+DA:48,192
+DA:51,192
+DA:52,192
+DA:53,192
+DA:54,192
+DA:55,192
+LF:20
+LH:20
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref-vector.c
+FNL:0,103,110
+FNA:0,8856,CeedVectorGetArrayWrite_Ref
+FNL:1,115,115
+FNA:1,10560,CeedVectorRestoreArray_Ref
+FNL:2,120,120
+FNA:2,11856,CeedVectorRestoreArrayRead_Ref
+FNL:3,125,132
+FNA:3,2784,CeedVectorDestroy_Ref
+FNL:4,137,156
+FNA:4,2784,CeedVectorCreate_Ref
+FNL:5,18,25
+FNA:5,13560,CeedVectorHasValidArray_Ref
+FNL:6,30,37
+FNA:6,0,CeedVectorHasBorrowedArrayOfType_Ref
+FNL:7,42,54
+FNA:7,4752,CeedVectorSetArray_Ref
+FNL:8,59,70
+FNA:8,0,CeedVectorTakeArray_Ref
+FNL:9,75,84
+FNA:9,22416,CeedVectorGetArrayCore_Ref
+FNL:10,89,91
+FNA:10,11856,CeedVectorGetArrayRead_Ref
+FNL:11,96,98
+FNA:11,1704,CeedVectorGetArray_Ref
+FNF:12
+FNH:10
+DA:18,13560
+DA:21,13560
+DA:23,13560
+DA:24,13560
+DA:30,0
+DA:33,0
+DA:34,0
+DA:35,0
+DA:36,0
+DA:42,4752
+DA:46,4752
+DA:47,4752
+DA:49,4752
+DA:51,4752
+DA:53,4752
+DA:59,0
+DA:62,0
+DA:64,0
+DA:66,0
+DA:67,0
+DA:68,0
+DA:69,0
+DA:75,22416
+DA:78,22416
+DA:80,22416
+DA:82,22416
+DA:83,22416
+DA:89,11856
+DA:90,11856
+DA:96,1704
+DA:97,1704
+DA:103,8856
+DA:106,8856
+DA:108,8856
+DA:109,8856
+DA:115,10560
+DA:120,11856
+DA:125,2784
+DA:128,2784
+DA:129,2784
+DA:130,2784
+DA:131,2784
+DA:137,2784
+DA:141,2784
+DA:142,2784
+DA:143,2784
+DA:144,2784
+DA:145,2784
+DA:146,2784
+DA:147,2784
+DA:148,2784
+DA:149,2784
+DA:150,2784
+DA:151,2784
+DA:152,2784
+DA:153,2784
+DA:154,2784
+DA:155,2784
+LF:58
+LH:46
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/ref/ceed-ref.c
+FNL:0,17,36
+FNA:0,192,CeedInit_Ref
+FNL:1,41,46
+FNA:1,192,CeedRegister_Ref
+FNF:2
+FNH:2
+DA:17,192
+DA:18,192
+DA:20,192
+DA:22,192
+DA:23,192
+DA:24,192
+DA:25,192
+DA:26,192
+DA:27,192
+DA:28,192
+DA:29,192
+DA:30,192
+DA:31,192
+DA:32,192
+DA:33,192
+DA:34,192
+DA:35,192
+DA:41,192
+DA:44,192
+LF:19
+LH:19
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/weak/ceed-backend-weak.c
+FNF:0
+FNH:0
+LF:0
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/xsmm/ceed-xsmm-blocked.c
+FNL:0,18,32
+FNA:0,24,CeedInit_Xsmm_Blocked
+FNL:1,37,37
+FNA:1,192,CeedRegister_Xsmm_Blocked
+FNF:2
+FNH:2
+DA:18,24
+DA:21,24
+DA:23,24
+DA:26,24
+DA:27,24
+DA:28,24
+DA:30,24
+DA:31,24
+DA:37,192
+LF:9
+LH:9
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/xsmm/ceed-xsmm-serial.c
+FNL:0,18,32
+FNA:0,24,CeedInit_Xsmm_Serial
+FNL:1,37,37
+FNA:1,192,CeedRegister_Xsmm_Serial
+FNF:2
+FNH:2
+DA:18,24
+DA:21,24
+DA:23,24
+DA:26,24
+DA:27,24
+DA:28,24
+DA:30,24
+DA:31,24
+DA:37,192
+LF:9
+LH:9
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/backends/xsmm/ceed-xsmm-tensor.c
+FNL:0,17,63
+FNA:0,2016,CeedTensorContractApply_Xsmm
+FNL:1,68,71
+FNA:1,96,CeedTensorContractCreate_Xsmm
+FNF:2
+FNH:2
+DA:17,2016
+DA:19,2016
+DA:21,1344
+DA:22,1344
+DA:23,1344
+DA:27,1344
+DA:29,1344
+DA:32,1344
+DA:35,1344
+DA:36,1344
+DA:37,1344
+DA:38,1344
+DA:41,672
+DA:42,672
+DA:43,672
+DA:47,672
+DA:49,672
+DA:52,672
+DA:55,672
+DA:56,4696
+DA:57,4024
+DA:58,4024
+DA:59,4024
+DA:62,2016
+DA:68,96
+DA:69,96
+DA:70,96
+LF:27
+LH:27
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/ex1-volumetest/(t*.test/(t*.-f.h))
+FNL:0,2,36
+FNA:0,384,build_mass_
+FNL:1,39,54
+FNA:1,384,apply_mass_
+FNF:2
+FNH:2
+DA:2,384
+DA:14,384
+DA:15,384
+DA:17,672
+DA:19,3648
+DA:20,3360
+DA:24,4752
+DA:25,4680
+DA:29,17664
+DA:32,17304
+DA:35,384
+DA:36,384
+DA:39,384
+DA:50,25344
+DA:51,25344
+DA:53,384
+DA:54,384
+LF:17
+LH:17
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume-f.f90
+FNL:0,157,190
+FNA:0,96,transformmeshcoords_
+FNL:1,193,250
+FNA:1,96,setcartesianmeshcoords_
+FNL:2,253,557
+FNA:2,96,MAIN__
+FNL:3,42,72
+FNA:3,96,getcartesianmeshsize_
+FNL:4,557,557
+FNA:4,96,main
+FNL:5,75,154
+FNA:5,192,buildcartesianrestriction_
+FNF:6
+FNH:6
+DA:42,96
+DA:51,96
+DA:52,96
+DA:58,384
+DA:59,288
+DA:60,288
+DA:62,96
+DA:64,288
+DA:65,192
+DA:66,192
+DA:67,64
+DA:68,64
+DA:70,288
+DA:72,96
+DA:75,192
+DA:100,192
+DA:104,192
+DA:105,192
+DA:106,192
+DA:107,192
+DA:108,192
+DA:110,576
+DA:111,384
+DA:112,384
+DA:113,576
+DA:115,192
+DA:119,192
+DA:120,192
+DA:122,2880
+DA:123,2688
+DA:124,2688
+DA:125,2688
+DA:126,2688
+DA:128,6144
+DA:129,3456
+DA:130,6144
+DA:133,41920
+DA:134,39040
+DA:135,39040
+DA:136,39040
+DA:138,122880
+DA:139,83840
+DA:140,83840
+DA:141,122880
+DA:143,41728
+DA:148,192
+DA:149,192
+DA:151,96
+DA:153,192
+DA:154,192
+DA:157,96
+DA:172,96
+DA:173,128
+DA:175,4160
+DA:176,4160
+DA:178,32
+DA:181,12160
+DA:182,12096
+DA:183,12096
+DA:185,12096
+DA:186,12160
+DA:188,160
+DA:190,96
+DA:193,96
+DA:210,96
+DA:211,96
+DA:214,96
+DA:215,96
+DA:217,288
+DA:218,192
+DA:219,288
+DA:222,96
+DA:223,96
+DA:226,96
+DA:227,96
+DA:228,96
+DA:229,96
+DA:230,576
+DA:231,576
+DA:234,16320
+DA:235,16224
+DA:237,51840
+DA:238,35520
+DA:239,35520
+DA:240,51744
+DA:243,96
+DA:245,96
+DA:247,96
+DA:248,96
+DA:249,96
+DA:250,96
+DA:253,96
+DA:267,96
+DA:282,96
+DA:283,96
+DA:284,96
+DA:285,96
+DA:286,96
+DA:287,96
+DA:288,96
+DA:289,96
+DA:290,96
+DA:291,96
+DA:292,96
+DA:296,96
+DA:297,624
+DA:298,528
+DA:300,96
+DA:303,0
+DA:306,96
+DA:309,96
+DA:310,96
+DA:311,96
+DA:314,0
+DA:315,0
+DA:318,0
+DA:319,0
+DA:322,0
+DA:323,0
+DA:326,0
+DA:327,0
+DA:330,0
+DA:331,0
+DA:335,96
+DA:338,528
+DA:343,96
+DA:344,96
+DA:345,96
+DA:347,0
+DA:352,96
+DA:354,0
+DA:355,0
+DA:356,0
+DA:357,0
+DA:358,0
+DA:359,0
+DA:360,0
+DA:361,0
+DA:362,0
+DA:364,0
+DA:366,0
+DA:367,0
+DA:368,0
+DA:370,0
+DA:377,96
+DA:381,96
+DA:382,96
+DA:385,96
+DA:386,96
+DA:388,0
+DA:389,0
+DA:390,0
+DA:392,0
+DA:393,0
+DA:395,0
+DA:401,96
+DA:403,96
+DA:405,96
+DA:407,0
+DA:408,0
+DA:414,96
+DA:415,96
+DA:418,96
+DA:419,96
+DA:420,96
+DA:422,96
+DA:424,96
+DA:427,96
+DA:428,16
+DA:430,16
+DA:433,16
+DA:436,48
+DA:441,48
+DA:442,48
+DA:443,48
+DA:444,48
+DA:445,48
+DA:449,96
+DA:450,96
+DA:451,96
+DA:452,96
+DA:455,96
+DA:456,96
+DA:457,288
+DA:458,288
+DA:460,96
+DA:461,96
+DA:464,96
+DA:465,48
+DA:469,48
+DA:470,48
+DA:471,48
+DA:472,48
+DA:476,96
+DA:477,96
+DA:478,96
+DA:479,96
+DA:482,96
+DA:483,96
+DA:485,96
+DA:486,96
+DA:487,96
+DA:488,96
+DA:489,96
+DA:490,96
+DA:493,96
+DA:496,96
+DA:499,96
+DA:501,0
+DA:504,96
+DA:506,96
+DA:511,96
+DA:513,96
+DA:514,16320
+DA:515,16320
+DA:517,96
+DA:519,96
+DA:521,0
+DA:522,0
+DA:523,0
+DA:524,0
+DA:527,96
+DA:528,32
+DA:530,0
+DA:534,64
+DA:536,0
+DA:543,96
+DA:544,96
+DA:545,96
+DA:546,96
+DA:547,96
+DA:548,96
+DA:549,96
+DA:550,96
+DA:551,96
+DA:552,96
+DA:553,96
+DA:554,96
+DA:555,96
+DA:556,96
+DA:557,96
+LF:241
+LH:200
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.c
+FNL:0,294,316
+FNA:0,96,GetCartesianMeshSize
+FNL:1,318,363
+FNA:1,192,BuildCartesianRestriction
+FNL:2,365,394
+FNA:2,96,SetCartesianMeshCoords
+FNL:3,401,429
+FNA:3,96,TransformMeshCoords
+FNL:4,55,292
+FNA:4,96,main
+FNF:5
+FNH:5
+DA:55,96
+DA:56,96
+DA:57,96
+DA:58,96
+DA:59,96
+DA:60,96
+DA:61,96
+DA:62,96
+DA:63,96
+DA:66,432
+DA:97,96
+DA:100,96
+DA:121,96
+DA:126,96
+DA:127,96
+DA:130,96
+DA:132,96
+DA:133,96
+DA:146,96
+DA:147,96
+DA:148,96
+DA:158,96
+DA:159,96
+DA:162,96
+DA:168,96
+DA:169,96
+DA:170,96
+DA:175,96
+DA:177,48
+DA:178,48
+DA:179,48
+DA:182,48
+DA:183,48
+DA:184,48
+DA:185,48
+DA:186,48
+DA:192,96
+DA:193,96
+DA:194,96
+DA:195,96
+DA:199,96
+DA:200,96
+DA:202,288
+DA:203,96
+DA:204,96
+DA:209,96
+DA:211,48
+DA:214,48
+DA:215,48
+DA:216,48
+DA:217,48
+DA:223,96
+DA:224,96
+DA:225,96
+DA:226,96
+DA:231,96
+DA:232,96
+DA:235,96
+DA:238,96
+DA:241,96
+DA:246,96
+DA:253,96
+DA:258,96
+DA:259,16320
+DA:260,96
+DA:262,96
+DA:270,96
+DA:272,96
+DA:276,96
+DA:277,96
+DA:278,96
+DA:279,96
+DA:280,96
+DA:281,96
+DA:282,96
+DA:283,96
+DA:284,96
+DA:285,96
+DA:286,96
+DA:287,96
+DA:288,96
+DA:289,96
+DA:290,96
+DA:291,96
+DA:294,96
+DA:297,96
+DA:298,96
+DA:300,384
+DA:301,288
+DA:302,288
+DA:304,96
+DA:306,288
+DA:307,192
+DA:309,192
+DA:310,64
+DA:311,64
+DA:313,192
+DA:315,96
+DA:318,192
+DA:319,192
+DA:320,192
+DA:321,192
+DA:322,192
+DA:323,192
+DA:325,576
+DA:326,384
+DA:327,384
+DA:328,384
+DA:330,192
+DA:334,192
+DA:336,2880
+DA:337,2688
+DA:339,6144
+DA:340,3456
+DA:341,3456
+DA:343,2688
+DA:345,41728
+DA:346,39040
+DA:348,122880
+DA:349,83840
+DA:350,83840
+DA:351,83840
+DA:353,39040
+DA:356,192
+DA:358,192
+DA:359,96
+DA:361,192
+DA:362,192
+DA:365,96
+DA:366,96
+DA:367,96
+DA:369,288
+DA:370,192
+DA:371,192
+DA:375,96
+DA:376,96
+DA:379,96
+DA:380,576
+DA:381,16320
+DA:382,16224
+DA:384,51744
+DA:385,35520
+DA:387,35520
+DA:388,35520
+DA:391,96
+DA:392,96
+DA:393,96
+DA:401,96
+DA:405,96
+DA:406,96
+DA:407,4160
+DA:409,4128
+DA:411,32
+DA:413,64
+DA:415,12160
+DA:418,12096
+DA:420,12096
+DA:421,12096
+DA:422,12096
+DA:423,12096
+DA:425,64
+DA:427,96
+DA:428,96
+LF:163
+LH:163
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.h
+FNL:0,16,52
+FNA:0,384,build_mass
+FNL:1,55,64
+FNA:1,384,apply_mass
+FNF:2
+FNH:2
+DA:16,384
+DA:17,384
+DA:21,384
+DA:22,384
+DA:24,384
+DA:25,288
+DA:26,288
+DA:29,3360
+DA:30,288
+DA:31,72
+DA:32,72
+DA:35,4680
+DA:36,4608
+DA:38,72
+DA:39,24
+DA:40,24
+DA:43,17304
+DA:44,17280
+DA:45,17280
+DA:46,17280
+DA:47,17280
+DA:49,24
+DA:51,384
+DA:55,384
+DA:58,384
+DA:59,384
+DA:62,25344
+DA:63,384
+LF:28
+LH:28
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/ceed-gallery-list.h
+FNF:0
+FNH:0
+DA:15,96
+DA:16,96
+DA:17,96
+DA:18,96
+DA:19,96
+DA:20,96
+DA:21,96
+DA:22,96
+DA:23,96
+DA:24,96
+DA:25,96
+DA:26,96
+DA:27,96
+DA:28,96
+DA:29,96
+DA:30,96
+LF:16
+LH:16
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/identity/ceed-identity.c
+FNL:0,17,36
+FNA:0,0,CeedQFunctionInit_Identity
+FNL:1,41,43
+FNA:1,96,CeedQFunctionRegister_Identity
+FNF:2
+FNH:1
+DA:17,0
+DA:19,0
+DA:20,0
+DA:24,0
+DA:28,0
+DA:29,0
+DA:30,0
+DA:31,0
+DA:32,0
+DA:33,0
+DA:35,0
+DA:41,96
+DA:42,96
+LF:13
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/mass-vector/ceed-vectormassapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Vector3MassApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Vector3MassApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-mass1dbuild.c
+FNL:0,16,30
+FNA:0,32,CeedQFunctionInit_Mass1DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Mass1DBuild
+FNF:2
+FNH:2
+DA:16,32
+DA:18,32
+DA:19,32
+DA:22,32
+DA:23,32
+DA:24,32
+DA:25,32
+DA:27,32
+DA:29,32
+DA:35,96
+DA:36,96
+LF:11
+LH:11
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-mass2dbuild.c
+FNL:0,16,30
+FNA:0,32,CeedQFunctionInit_Mass2DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Mass2DBuild
+FNF:2
+FNH:2
+DA:16,32
+DA:18,32
+DA:19,32
+DA:22,32
+DA:23,32
+DA:24,32
+DA:25,32
+DA:27,32
+DA:29,32
+DA:35,96
+DA:36,96
+LF:11
+LH:11
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-mass3dbuild.c
+FNL:0,16,30
+FNA:0,32,CeedQFunctionInit_Mass3DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Mass3DBuild
+FNF:2
+FNH:2
+DA:16,32
+DA:18,32
+DA:19,32
+DA:22,32
+DA:23,32
+DA:24,32
+DA:25,32
+DA:27,32
+DA:29,32
+DA:35,96
+DA:36,96
+LF:11
+LH:11
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/mass/ceed-massapply.c
+FNL:0,16,29
+FNA:0,96,CeedQFunctionInit_MassApply
+FNL:1,34,36
+FNA:1,96,CeedQFunctionRegister_MassApply
+FNF:2
+FNH:2
+DA:16,96
+DA:18,96
+DA:19,96
+DA:22,96
+DA:23,96
+DA:24,96
+DA:26,96
+DA:28,96
+DA:34,96
+DA:35,96
+LF:10
+LH:10
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Vector3Poisson1DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Vector3Poisson1DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Vector3Poisson2DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Vector3Poisson2DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Vector3Poisson3DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Vector3Poisson3DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson1dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson1DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson1DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson1dbuild.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson1DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson1DBuild
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson2dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson2DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson2DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson2dbuild.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson2DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson2DBuild
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson3dapply.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson3DApply
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson3DApply
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/poisson/ceed-poisson3dbuild.c
+FNL:0,16,30
+FNA:0,0,CeedQFunctionInit_Poisson3DBuild
+FNL:1,35,37
+FNA:1,96,CeedQFunctionRegister_Poisson3DBuild
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:22,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:27,0
+DA:29,0
+DA:35,96
+DA:36,96
+LF:11
+LH:2
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/gallery/scale/ceed-scale.c
+FNL:0,16,24
+FNA:0,0,CeedQFunctionInit_Scale
+FNL:1,29,29
+FNA:1,96,CeedQFunctionRegister_Scale
+FNF:2
+FNH:1
+DA:16,0
+DA:18,0
+DA:19,0
+DA:23,0
+DA:29,96
+LF:5
+LH:1
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/ceed.h
+FNL:0,525,533
+FNA:0,14400,CeedIntPow
+FNL:1,545,545
+FNA:1,420064,CeedIntMin
+FNL:2,557,557
+FNA:2,0,CeedIntMax
+FNF:3
+FNH:2
+DA:525,14400
+DA:526,14400
+DA:527,20992
+DA:528,6592
+DA:529,6592
+DA:530,6592
+DA:532,14400
+DA:545,420064
+DA:557,0
+LF:9
+LH:8
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-identity.h
+FNL:0,17,31
+FNA:0,0,Identity
+FNF:1
+FNH:0
+DA:17,0
+DA:19,0
+DA:20,0
+DA:23,0
+DA:25,0
+DA:28,0
+DA:30,0
+LF:7
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
+FNL:0,13,24
+FNA:0,576,Mass1DBuild
+FNF:1
+FNH:1
+DA:13,576
+DA:16,576
+DA:18,576
+DA:21,6720
+DA:23,576
+LF:5
+LH:5
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
+FNL:0,13,26
+FNA:0,144,Mass2DBuild
+FNF:1
+FNH:1
+DA:13,144
+DA:16,144
+DA:18,144
+DA:21,9360
+DA:22,9216
+DA:25,144
+LF:6
+LH:6
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
+FNL:0,13,28
+FNA:0,48,Mass3DBuild
+FNF:1
+FNH:1
+DA:13,48
+DA:16,48
+DA:18,48
+DA:21,34608
+DA:22,34560
+DA:23,34560
+DA:24,34560
+DA:27,48
+LF:8
+LH:8
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-massapply.h
+FNL:0,13,24
+FNA:0,768,MassApply
+FNF:1
+FNH:1
+DA:13,768
+DA:16,768
+DA:18,768
+DA:21,50688
+DA:23,768
+LF:5
+LH:5
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
+FNL:0,13,25
+FNA:0,0,Poisson1DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:19,0
+DA:22,0
+DA:24,0
+LF:5
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
+FNL:0,13,28
+FNA:0,0,Poisson1DBuild
+FNF:1
+FNH:0
+DA:13,0
+DA:19,0
+DA:22,0
+DA:25,0
+DA:27,0
+LF:5
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
+FNL:0,13,39
+FNA:0,0,Poisson2DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:28,0
+DA:29,0
+DA:30,0
+DA:35,0
+DA:38,0
+LF:10
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
+FNL:0,13,38
+FNA:0,0,Poisson2DBuild
+FNF:1
+FNH:0
+DA:13,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:27,0
+DA:28,0
+DA:29,0
+DA:30,0
+DA:31,0
+DA:32,0
+DA:33,0
+DA:34,0
+DA:37,0
+LF:13
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
+FNL:0,13,41
+FNA:0,0,Poisson3DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:29,0
+DA:30,0
+DA:31,0
+DA:32,0
+DA:37,0
+DA:40,0
+LF:11
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
+FNL:0,13,51
+FNA:0,0,Poisson3DBuild
+FNF:1
+FNH:0
+DA:13,0
+DA:17,0
+DA:19,0
+DA:21,0
+DA:24,0
+DA:27,0
+DA:28,0
+DA:31,0
+DA:32,0
+DA:35,0
+DA:42,0
+DA:43,0
+DA:44,0
+DA:45,0
+DA:46,0
+DA:47,0
+DA:50,0
+LF:17
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-scale.h
+FNL:0,13,27
+FNA:0,0,Scale
+FNF:1
+FNH:0
+DA:13,0
+DA:15,0
+DA:19,0
+DA:20,0
+DA:22,0
+DA:25,0
+DA:26,0
+LF:7
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectormassapply.h
+FNL:0,13,30
+FNA:0,0,Vector3MassApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:29,0
+LF:8
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
+FNL:0,13,30
+FNA:0,0,Vector3Poisson1DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:24,0
+DA:25,0
+DA:29,0
+LF:8
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
+FNL:0,13,40
+FNA:0,0,Vector3Poisson2DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:28,0
+DA:29,0
+DA:30,0
+DA:35,0
+DA:36,0
+DA:39,0
+LF:11
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
+FNL:0,13,43
+FNA:0,0,Vector3Poisson3DApply
+FNF:1
+FNH:0
+DA:13,0
+DA:16,0
+DA:18,0
+DA:20,0
+DA:23,0
+DA:29,0
+DA:30,0
+DA:31,0
+DA:32,0
+DA:37,0
+DA:38,0
+DA:39,0
+DA:42,0
+LF:13
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-basis.c
+FNL:0,1078,1081
+FNA:0,6528,CeedBasisGetFESpace
+FNL:1,1093,1096
+FNA:1,0,CeedBasisGetTopologyDimension
+FNL:2,1108,1111
+FNA:2,4800,CeedBasisGetTensorContract
+FNL:3,1123,1127
+FNA:3,384,CeedBasisSetTensorContract
+FNL:4,1146,1156
+FNA:4,384,CeedMatrixMatrixMultiply
+FNL:5,1171,1207
+FNA:5,384,CeedQRFactorization
+FNL:6,1228,1241
+FNA:6,384,CeedHouseholderApplyQ
+FNL:7,1256,1285
+FNA:7,384,CeedMatrixPseudoinverse
+FNL:8,129,146
+FNA:8,0,CeedGivensRotation
+FNL:9,1300,1425
+FNA:9,0,CeedSymmetricSchurDecomposition
+FNL:10,1447,1507
+FNA:10,0,CeedSimultaneousDiagonalization
+FNL:11,1536,1576
+FNA:11,864,CeedBasisCreateTensorH1
+FNL:12,1594,1653
+FNA:12,384,CeedBasisCreateTensorH1Lagrange
+FNL:13,163,178
+FNA:13,0,CeedScalarView
+FNL:14,1673,1712
+FNA:14,0,CeedBasisCreateH1
+FNL:15,1732,1771
+FNA:15,0,CeedBasisCreateHdiv
+FNL:16,1791,1831
+FNA:16,0,CeedBasisCreateHcurl
+FNL:17,1856,1899
+FNA:17,0,CeedBasisCreateProjection
+FNL:18,190,193
+FNA:18,0,CeedBasisView_Object
+FNL:19,1914,1919
+FNA:19,6528,CeedBasisReferenceCopy
+FNL:20,1931,1934
+FNA:20,0,CeedBasisSetNumViewTabs
+FNL:21,1946,1949
+FNA:21,0,CeedBasisGetNumViewTabs
+FNL:22,1961,2037
+FNA:22,0,CeedBasisView
+FNL:23,204,207
+FNA:23,0,CeedBasisDestroy_Object
+FNL:24,2060,2065
+FNA:24,4800,CeedBasisApply
+FNL:25,2088,2094
+FNA:25,0,CeedBasisApplyAdd
+FNL:26,2116,2125
+FNA:26,0,CeedBasisApplyAtPoints
+FNL:27,2147,2157
+FNA:27,0,CeedBasisApplyAddAtPoints
+FNL:28,2169,2172
+FNA:28,768,CeedBasisGetCeed
+FNL:29,2183,2183
+FNA:29,0,CeedBasisReturnCeed
+FNL:30,2195,2198
+FNA:30,20736,CeedBasisGetDimension
+FNL:31,2210,2213
+FNA:31,0,CeedBasisGetTopology
+FNL:32,2225,2228
+FNA:32,13248,CeedBasisGetNumComponents
+FNL:33,2240,2243
+FNA:33,10176,CeedBasisGetNumNodes
+FNL:34,2255,2259
+FNA:34,5184,CeedBasisGetNumNodes1D
+FNL:35,226,322
+FNA:35,0,CeedBasisCreateProjectionMatrices
+FNL:36,2271,2274
+FNA:36,10368,CeedBasisGetNumQuadraturePoints
+FNL:37,2286,2290
+FNA:37,5184,CeedBasisGetNumQuadraturePoints1D
+FNL:38,2302,2305
+FNA:38,0,CeedBasisGetQRef
+FNL:39,2317,2320
+FNA:39,192,CeedBasisGetQWeights
+FNL:40,2332,2354
+FNA:40,0,CeedBasisGetInterp
+FNL:41,2366,2373
+FNA:41,4992,CeedBasisGetInterp1D
+FNL:42,2385,2410
+FNA:42,0,CeedBasisGetGrad
+FNL:43,2422,2429
+FNA:43,384,CeedBasisGetGrad1D
+FNL:44,2441,2444
+FNA:44,0,CeedBasisGetDiv
+FNL:45,2456,2459
+FNA:45,0,CeedBasisGetCurl
+FNL:46,2470,2490
+FNA:46,13440,CeedBasisDestroy
+FNL:47,2503,2542
+FNA:47,384,CeedGaussQuadrature
+FNL:48,2555,2608
+FNA:48,576,CeedLobattoQuadrature
+FNL:49,345,376
+FNA:49,4800,CeedBasisApplyCheckDims
+FNL:50,398,453
+FNA:50,0,CeedBasisApplyAtPointsCheckDims
+FNL:51,476,677
+FNA:51,0,CeedBasisApplyAtPoints_Core
+FNL:52,48,53
+FNA:52,0,CeedChebyshevPolynomialsAtPoint
+FNL:53,66,80
+FNA:53,0,CeedChebyshevDerivativeAtPoint
+FNL:54,708,721
+FNA:54,0,CeedBasisCreateH1Fallback
+FNL:55,733,754
+FNA:55,384,CeedBasisGetCollocatedGrad
+FNL:56,766,796
+FNA:56,0,CeedBasisGetChebyshevInterp1D
+FNL:57,808,811
+FNA:57,10176,CeedBasisIsTensor
+FNL:58,823,837
+FNA:58,384,CeedBasisIsCollocated
+FNL:59,849,852
+FNA:59,5184,CeedBasisGetData
+FNL:60,864,867
+FNA:60,384,CeedBasisSetData
+FNL:61,878,881
+FNA:61,6144,CeedBasisReference
+FNL:62,897,923
+FNA:62,10368,CeedBasisGetNumQuadratureComponents
+FNL:63,937,1066
+FNA:63,0,CeedBasisGetFlopsEstimate
+FNL:64,99,108
+FNA:64,3840,CeedHouseholderReflect
+FNF:65
+FNH:33
+DA:48,0
+DA:49,0
+DA:50,0
+DA:51,0
+DA:52,0
+DA:66,0
+DA:69,0
+DA:70,0
+DA:71,0
+DA:72,0
+DA:73,0
+DA:74,0
+DA:75,0
+DA:76,0
+DA:77,0
+DA:79,0
+DA:99,3840
+DA:100,19200
+DA:101,15360
+DA:103,65280
+DA:104,15360
+DA:105,65280
+DA:107,3840
+DA:129,0
+DA:130,0
+DA:132,0
+DA:133,0
+DA:134,0
+DA:135,0
+DA:139,0
+DA:140,0
+DA:142,0
+DA:143,0
+DA:145,0
+DA:163,0
+DA:164,0
+DA:165,0
+DA:169,0
+DA:170,0
+DA:172,0
+DA:173,0
+DA:174,0
+DA:175,0
+DA:177,0
+DA:190,0
+DA:191,0
+DA:192,0
+DA:204,0
+DA:205,0
+DA:206,0
+DA:226,0
+DA:231,0
+DA:232,0
+DA:233,0
+DA:237,0
+DA:243,0
+DA:244,0
+DA:245,0
+DA:247,0
+DA:248,0
+DA:249,0
+DA:250,0
+DA:252,0
+DA:253,0
+DA:259,0
+DA:260,0
+DA:261,0
+DA:267,0
+DA:269,0
+DA:271,0
+DA:272,0
+DA:273,0
+DA:274,0
+DA:276,0
+DA:277,0
+DA:278,0
+DA:280,0
+DA:281,0
+DA:286,0
+DA:287,0
+DA:288,0
+DA:290,0
+DA:293,0
+DA:296,0
+DA:297,0
+DA:299,0
+DA:300,0
+DA:302,0
+DA:303,0
+DA:304,0
+DA:305,0
+DA:306,0
+DA:308,0
+DA:310,0
+DA:311,0
+DA:313,0
+DA:314,0
+DA:319,0
+DA:320,0
+DA:321,0
+DA:345,4800
+DA:347,4800
+DA:349,4800
+DA:350,4800
+DA:351,4800
+DA:352,4800
+DA:353,4800
+DA:354,4800
+DA:355,4800
+DA:358,4800
+DA:359,4800
+DA:360,4608
+DA:365,1536
+DA:366,9216
+DA:367,3072
+DA:368,3072
+DA:369,4608
+DA:370,192
+DA:371,192
+DA:372,192
+DA:374,4800
+DA:375,4800
+DA:398,0
+DA:400,0
+DA:401,0
+DA:403,0
+DA:404,0
+DA:405,0
+DA:406,0
+DA:407,0
+DA:408,0
+DA:409,0
+DA:410,0
+DA:411,0
+DA:414,0
+DA:415,0
+DA:422,0
+DA:426,0
+DA:427,0
+DA:428,0
+DA:429,0
+DA:430,0
+DA:431,0
+DA:432,0
+DA:433,0
+DA:434,0
+DA:435,0
+DA:436,0
+DA:437,0
+DA:438,0
+DA:439,0
+DA:440,0
+DA:441,0
+DA:442,0
+DA:451,0
+DA:452,0
+DA:476,0
+DA:478,0
+DA:480,0
+DA:482,0
+DA:483,0
+DA:484,0
+DA:485,0
+DA:491,0
+DA:492,0
+DA:495,0
+DA:497,0
+DA:498,0
+DA:499,0
+DA:501,0
+DA:507,0
+DA:508,0
+DA:509,0
+DA:510,0
+DA:511,0
+DA:513,0
+DA:514,0
+DA:515,0
+DA:519,0
+DA:520,0
+DA:521,0
+DA:522,0
+DA:526,0
+DA:528,0
+DA:530,0
+DA:532,0
+DA:534,0
+DA:536,0
+DA:537,0
+DA:538,0
+DA:542,0
+DA:543,0
+DA:549,0
+DA:552,0
+DA:553,0
+DA:554,0
+DA:555,0
+DA:556,0
+DA:557,0
+DA:560,0
+DA:561,0
+DA:563,0
+DA:565,0
+DA:566,0
+DA:568,0
+DA:569,0
+DA:571,0
+DA:573,0
+DA:575,0
+DA:576,0
+DA:579,0
+DA:581,0
+DA:582,0
+DA:584,0
+DA:586,0
+DA:587,0
+DA:588,0
+DA:590,0
+DA:591,0
+DA:593,0
+DA:596,0
+DA:598,0
+DA:600,0
+DA:602,0
+DA:603,0
+DA:604,0
+DA:605,0
+DA:607,0
+DA:614,0
+DA:615,0
+DA:616,0
+DA:618,0
+DA:619,0
+DA:620,0
+DA:623,0
+DA:624,0
+DA:626,0
+DA:627,0
+DA:629,0
+DA:630,0
+DA:632,0
+DA:633,0
+DA:636,0
+DA:638,0
+DA:639,0
+DA:642,0
+DA:644,0
+DA:645,0
+DA:647,0
+DA:648,0
+DA:650,0
+DA:651,0
+DA:652,0
+DA:655,0
+DA:656,0
+DA:660,0
+DA:662,0
+DA:664,0
+DA:666,0
+DA:667,0
+DA:668,0
+DA:671,0
+DA:672,0
+DA:673,0
+DA:676,0
+DA:708,0
+DA:710,0
+DA:713,0
+DA:714,0
+DA:716,0
+DA:717,0
+DA:718,0
+DA:719,0
+DA:720,0
+DA:733,384
+DA:740,384
+DA:741,384
+DA:742,384
+DA:745,384
+DA:746,384
+DA:747,384
+DA:748,384
+DA:749,384
+DA:751,384
+DA:752,384
+DA:753,384
+DA:766,0
+DA:772,0
+DA:773,0
+DA:774,0
+DA:778,0
+DA:779,0
+DA:780,0
+DA:781,0
+DA:784,0
+DA:785,0
+DA:788,0
+DA:789,0
+DA:792,0
+DA:793,0
+DA:794,0
+DA:795,0
+DA:808,10176
+DA:809,10176
+DA:810,10176
+DA:823,384
+DA:824,384
+DA:825,0
+DA:827,0
+DA:828,0
+DA:829,0
+DA:830,0
+DA:834,384
+DA:836,384
+DA:849,5184
+DA:850,5184
+DA:851,5184
+DA:864,384
+DA:865,384
+DA:866,384
+DA:878,6144
+DA:879,6144
+DA:880,6144
+DA:897,10368
+DA:900,10368
+DA:901,10368
+DA:902,6528
+DA:905,6528
+DA:906,6528
+DA:907,6528
+DA:908,3264
+DA:909,3264
+DA:910,3264
+DA:911,0
+DA:912,0
+DA:913,0
+DA:914,0
+DA:915,0
+DA:916,0
+DA:917,576
+DA:919,576
+DA:920,576
+DA:922,10368
+DA:937,0
+DA:941,0
+DA:942,0
+DA:943,0
+DA:946,0
+DA:947,0
+DA:948,0
+DA:949,0
+DA:950,0
+DA:951,0
+DA:952,0
+DA:954,0
+DA:956,0
+DA:957,0
+DA:958,0
+DA:959,0
+DA:961,0
+DA:962,0
+DA:967,0
+DA:968,0
+DA:971,0
+DA:972,0
+DA:974,0
+DA:975,0
+DA:976,0
+DA:977,0
+DA:980,0
+DA:981,0
+DA:982,0
+DA:983,0
+DA:984,0
+DA:985,0
+DA:986,0
+DA:987,0
+DA:988,0
+DA:990,0
+DA:992,0
+DA:994,0
+DA:995,0
+DA:996,0
+DA:997,0
+DA:998,0
+DA:1000,0
+DA:1002,0
+DA:1004,0
+DA:1006,0
+DA:1014,0
+DA:1015,0
+DA:1016,0
+DA:1019,0
+DA:1020,0
+DA:1021,0
+DA:1022,0
+DA:1023,0
+DA:1024,0
+DA:1025,0
+DA:1026,0
+DA:1027,0
+DA:1028,0
+DA:1029,0
+DA:1037,0
+DA:1038,0
+DA:1039,0
+DA:1045,0
+DA:1046,0
+DA:1047,0
+DA:1048,0
+DA:1049,0
+DA:1050,0
+DA:1051,0
+DA:1052,0
+DA:1053,0
+DA:1054,0
+DA:1058,0
+DA:1059,0
+DA:1060,0
+DA:1061,0
+DA:1062,0
+DA:1065,0
+DA:1078,6528
+DA:1079,6528
+DA:1080,6528
+DA:1093,0
+DA:1094,0
+DA:1095,0
+DA:1108,4800
+DA:1109,4800
+DA:1110,4800
+DA:1123,384
+DA:1124,384
+DA:1125,384
+DA:1126,384
+DA:1146,384
+DA:1147,2688
+DA:1148,16128
+DA:1149,13824
+DA:1151,82944
+DA:1152,13824
+DA:1155,384
+DA:1171,384
+DA:1172,384
+DA:1175,384
+DA:1177,2304
+DA:1178,1920
+DA:1180,1920
+DA:1181,0
+DA:1182,0
+DA:1185,1920
+DA:1186,7680
+DA:1187,5760
+DA:1188,5760
+DA:1190,1920
+DA:1191,1920
+DA:1193,1920
+DA:1197,1920
+DA:1198,7680
+DA:1201,1920
+DA:1203,1920
+DA:1204,7680
+DA:1206,384
+DA:1228,384
+DA:1232,384
+DA:1233,2304
+DA:1234,1920
+DA:1235,7680
+DA:1237,1920
+DA:1239,384
+DA:1240,384
+DA:1256,384
+DA:1259,384
+DA:1260,384
+DA:1261,384
+DA:1262,384
+DA:1265,384
+DA:1268,2688
+DA:1269,384
+DA:1271,2688
+DA:1272,2304
+DA:1273,11520
+DA:1274,9216
+DA:1275,32256
+DA:1276,9216
+DA:1281,384
+DA:1282,384
+DA:1283,384
+DA:1284,384
+DA:1300,0
+DA:1302,0
+DA:1304,0
+DA:1307,0
+DA:1308,0
+DA:1309,0
+DA:1313,0
+DA:1315,0
+DA:1317,0
+DA:1318,0
+DA:1319,0
+DA:1320,0
+DA:1322,0
+DA:1323,0
+DA:1325,0
+DA:1329,0
+DA:1330,0
+DA:1333,0
+DA:1334,0
+DA:1335,0
+DA:1338,0
+DA:1339,0
+DA:1342,0
+DA:1343,0
+DA:1344,0
+DA:1345,0
+DA:1349,0
+DA:1350,0
+DA:1351,0
+DA:1352,0
+DA:1353,0
+DA:1354,0
+DA:1356,0
+DA:1361,0
+DA:1362,0
+DA:1364,0
+DA:1366,0
+DA:1367,0
+DA:1368,0
+DA:1369,0
+DA:1370,0
+DA:1372,0
+DA:1373,0
+DA:1374,0
+DA:1376,0
+DA:1379,0
+DA:1380,0
+DA:1381,0
+DA:1382,0
+DA:1383,0
+DA:1385,0
+DA:1387,0
+DA:1389,0
+DA:1390,0
+DA:1391,0
+DA:1393,0
+DA:1394,0
+DA:1396,0
+DA:1398,0
+DA:1399,0
+DA:1404,0
+DA:1405,0
+DA:1408,0
+DA:1411,0
+DA:1412,0
+DA:1413,0
+DA:1416,0
+DA:1420,0
+DA:1423,0
+DA:1424,0
+DA:1447,0
+DA:1450,0
+DA:1451,0
+DA:1452,0
+DA:1455,0
+DA:1456,0
+DA:1459,0
+DA:1460,0
+DA:1461,0
+DA:1462,0
+DA:1463,0
+DA:1471,0
+DA:1474,0
+DA:1475,0
+DA:1476,0
+DA:1477,0
+DA:1481,0
+DA:1483,0
+DA:1486,0
+DA:1489,0
+DA:1490,0
+DA:1491,0
+DA:1492,0
+DA:1493,0
+DA:1500,0
+DA:1503,0
+DA:1504,0
+DA:1505,0
+DA:1506,0
+DA:1536,864
+DA:1538,864
+DA:1541,480
+DA:1542,480
+DA:1543,480
+DA:1544,480
+DA:1545,480
+DA:1548,384
+DA:1549,384
+DA:1550,384
+DA:1551,384
+DA:1553,384
+DA:1555,384
+DA:1556,384
+DA:1557,384
+DA:1558,384
+DA:1559,384
+DA:1560,384
+DA:1561,384
+DA:1562,384
+DA:1563,384
+DA:1564,384
+DA:1565,384
+DA:1566,384
+DA:1567,384
+DA:1568,384
+DA:1569,384
+DA:1570,384
+DA:1571,384
+DA:1572,384
+DA:1573,384
+DA:1574,384
+DA:1575,384
+DA:1594,384
+DA:1596,384
+DA:1599,384
+DA:1600,384
+DA:1601,384
+DA:1602,384
+DA:1605,384
+DA:1606,384
+DA:1607,384
+DA:1608,384
+DA:1609,384
+DA:1610,384
+DA:1611,384
+DA:1612,384
+DA:1613,384
+DA:1614,384
+DA:1615,0
+DA:1616,0
+DA:1617,0
+DA:1619,384
+DA:1623,2688
+DA:1624,2304
+DA:1625,2304
+DA:1626,2304
+DA:1627,11520
+DA:1628,9216
+DA:1629,9216
+DA:1630,9216
+DA:1631,32256
+DA:1632,23040
+DA:1633,23040
+DA:1634,23040
+DA:1635,9216
+DA:1636,9216
+DA:1638,23040
+DA:1639,23040
+DA:1641,9216
+DA:1645,384
+DA:1646,384
+DA:1647,384
+DA:1648,384
+DA:1649,384
+DA:1650,384
+DA:1651,384
+DA:1652,384
+DA:1673,0
+DA:1675,0
+DA:1677,0
+DA:1680,0
+DA:1681,0
+DA:1682,0
+DA:1683,0
+DA:1684,0
+DA:1687,0
+DA:1688,0
+DA:1689,0
+DA:1691,0
+DA:1693,0
+DA:1694,0
+DA:1695,0
+DA:1696,0
+DA:1697,0
+DA:1698,0
+DA:1699,0
+DA:1700,0
+DA:1701,0
+DA:1702,0
+DA:1703,0
+DA:1704,0
+DA:1705,0
+DA:1706,0
+DA:1707,0
+DA:1708,0
+DA:1709,0
+DA:1710,0
+DA:1711,0
+DA:1732,0
+DA:1734,0
+DA:1736,0
+DA:1739,0
+DA:1740,0
+DA:1741,0
+DA:1742,0
+DA:1743,0
+DA:1746,0
+DA:1747,0
+DA:1748,0
+DA:1750,0
+DA:1752,0
+DA:1753,0
+DA:1754,0
+DA:1755,0
+DA:1756,0
+DA:1757,0
+DA:1758,0
+DA:1759,0
+DA:1760,0
+DA:1761,0
+DA:1762,0
+DA:1763,0
+DA:1764,0
+DA:1765,0
+DA:1766,0
+DA:1767,0
+DA:1768,0
+DA:1769,0
+DA:1770,0
+DA:1791,0
+DA:1793,0
+DA:1795,0
+DA:1798,0
+DA:1799,0
+DA:1800,0
+DA:1801,0
+DA:1802,0
+DA:1805,0
+DA:1806,0
+DA:1807,0
+DA:1809,0
+DA:1810,0
+DA:1812,0
+DA:1813,0
+DA:1814,0
+DA:1815,0
+DA:1816,0
+DA:1817,0
+DA:1818,0
+DA:1819,0
+DA:1820,0
+DA:1821,0
+DA:1822,0
+DA:1823,0
+DA:1824,0
+DA:1825,0
+DA:1826,0
+DA:1827,0
+DA:1828,0
+DA:1829,0
+DA:1830,0
+DA:1856,0
+DA:1862,0
+DA:1865,0
+DA:1871,0
+DA:1872,0
+DA:1873,0
+DA:1875,0
+DA:1876,0
+DA:1877,0
+DA:1880,0
+DA:1881,0
+DA:1882,0
+DA:1888,0
+DA:1889,0
+DA:1890,0
+DA:1891,0
+DA:1895,0
+DA:1896,0
+DA:1897,0
+DA:1898,0
+DA:1914,6528
+DA:1915,6528
+DA:1916,6528
+DA:1917,6528
+DA:1918,6528
+DA:1931,0
+DA:1932,0
+DA:1933,0
+DA:1946,0
+DA:1947,0
+DA:1948,0
+DA:1961,0
+DA:1963,0
+DA:1968,0
+DA:1969,0
+DA:1970,0
+DA:1973,0
+DA:1975,0
+DA:1976,0
+DA:1977,0
+DA:1981,0
+DA:1982,0
+DA:1983,0
+DA:1985,0
+DA:1987,0
+DA:1989,0
+DA:1993,0
+DA:1994,0
+DA:1995,0
+DA:1996,0
+DA:1997,0
+DA:1998,0
+DA:2000,0
+DA:2001,0
+DA:2002,0
+DA:2003,0
+DA:2008,0
+DA:2009,0
+DA:2010,0
+DA:2011,0
+DA:2012,0
+DA:2013,0
+DA:2014,0
+DA:2015,0
+DA:2016,0
+DA:2018,0
+DA:2019,0
+DA:2020,0
+DA:2021,0
+DA:2022,0
+DA:2023,0
+DA:2024,0
+DA:2026,0
+DA:2027,0
+DA:2028,0
+DA:2030,0
+DA:2031,0
+DA:2032,0
+DA:2035,0
+DA:2036,0
+DA:2060,4800
+DA:2061,4800
+DA:2062,4800
+DA:2063,4800
+DA:2064,4800
+DA:2088,0
+DA:2089,0
+DA:2090,0
+DA:2091,0
+DA:2092,0
+DA:2093,0
+DA:2116,0
+DA:2118,0
+DA:2119,0
+DA:2120,0
+DA:2122,0
+DA:2124,0
+DA:2147,0
+DA:2149,0
+DA:2150,0
+DA:2151,0
+DA:2152,0
+DA:2154,0
+DA:2156,0
+DA:2169,768
+DA:2170,768
+DA:2171,768
+DA:2183,0
+DA:2195,20736
+DA:2196,20736
+DA:2197,20736
+DA:2210,0
+DA:2211,0
+DA:2212,0
+DA:2225,13248
+DA:2226,13248
+DA:2227,13248
+DA:2240,10176
+DA:2241,10176
+DA:2242,10176
+DA:2255,5184
+DA:2256,5184
+DA:2257,5184
+DA:2258,5184
+DA:2271,10368
+DA:2272,10368
+DA:2273,10368
+DA:2286,5184
+DA:2287,5184
+DA:2288,5184
+DA:2289,5184
+DA:2302,0
+DA:2303,0
+DA:2304,0
+DA:2317,192
+DA:2318,192
+DA:2319,192
+DA:2332,0
+DA:2333,0
+DA:2335,0
+DA:2338,0
+DA:2341,0
+DA:2342,0
+DA:2343,0
+DA:2344,0
+DA:2345,0
+DA:2347,0
+DA:2352,0
+DA:2353,0
+DA:2366,4992
+DA:2369,4992
+DA:2370,4992
+DA:2371,4992
+DA:2372,4992
+DA:2385,0
+DA:2386,0
+DA:2388,0
+DA:2391,0
+DA:2394,0
+DA:2395,0
+DA:2396,0
+DA:2397,0
+DA:2398,0
+DA:2399,0
+DA:2401,0
+DA:2402,0
+DA:2408,0
+DA:2409,0
+DA:2422,384
+DA:2425,384
+DA:2426,384
+DA:2427,384
+DA:2428,384
+DA:2441,0
+DA:2442,0
+DA:2443,0
+DA:2456,0
+DA:2457,0
+DA:2458,0
+DA:2470,13440
+DA:2471,13440
+DA:2472,13056
+DA:2473,13056
+DA:2475,384
+DA:2476,384
+DA:2477,384
+DA:2478,384
+DA:2479,384
+DA:2480,384
+DA:2481,384
+DA:2482,384
+DA:2483,384
+DA:2484,384
+DA:2485,384
+DA:2486,384
+DA:2487,384
+DA:2488,384
+DA:2489,384
+DA:2503,384
+DA:2504,384
+DA:2507,1920
+DA:2509,1536
+DA:2511,1536
+DA:2512,1536
+DA:2513,1536
+DA:2514,9216
+DA:2515,7680
+DA:2516,7680
+DA:2517,7680
+DA:2520,1536
+DA:2521,1536
+DA:2523,7296
+DA:2524,5760
+DA:2525,5760
+DA:2526,34560
+DA:2527,28800
+DA:2528,28800
+DA:2529,28800
+DA:2531,5760
+DA:2532,5760
+DA:2535,1536
+DA:2536,1536
+DA:2537,1536
+DA:2538,1536
+DA:2539,1536
+DA:2541,384
+DA:2555,576
+DA:2556,576
+DA:2560,576
+DA:2561,576
+DA:2562,576
+DA:2563,96
+DA:2564,96
+DA:2566,576
+DA:2567,576
+DA:2569,1728
+DA:2571,1152
+DA:2573,1152
+DA:2574,1152
+DA:2575,1152
+DA:2576,4608
+DA:2577,3456
+DA:2578,3456
+DA:2579,3456
+DA:2582,1152
+DA:2583,1152
+DA:2584,1152
+DA:2586,14976
+DA:2587,13824
+DA:2588,13824
+DA:2589,55296
+DA:2590,41472
+DA:2591,41472
+DA:2592,41472
+DA:2594,13824
+DA:2595,13824
+DA:2596,13824
+DA:2599,1152
+DA:2600,1152
+DA:2601,192
+DA:2602,192
+DA:2604,1152
+DA:2605,1152
+DA:2607,576
+LF:1024
+LH:339
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-config.c
+FNL:0,33,36
+FNA:0,0,CeedGetGitVersion
+FNL:1,50,53
+FNA:1,0,CeedSetIsClang
+FNL:2,67,70
+FNA:2,0,CeedGetIsClang
+FNL:3,85,88
+FNA:3,0,CeedGetBuildConfiguration
+FNF:4
+FNH:0
+DA:33,0
+DA:34,0
+DA:35,0
+DA:50,0
+DA:51,0
+DA:52,0
+DA:67,0
+DA:68,0
+DA:69,0
+DA:85,0
+DA:86,0
+DA:87,0
+LF:12
+LH:0
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-elemrestriction.c
+FNL:0,1018,1060
+FNA:0,0,CeedElemRestrictionCreateBlockedOriented
+FNL:1,1090,1133
+FNA:1,0,CeedElemRestrictionCreateBlockedCurlOriented
+FNL:2,111,114
+FNA:2,0,CeedElemRestrictionView_Object
+FNL:3,1154,1190
+FNA:3,288,CeedElemRestrictionCreateBlockedStrided
+FNL:4,1204,1221
+FNA:4,0,CeedElemRestrictionCreateUnsignedCopy
+FNL:5,1235,1252
+FNA:5,0,CeedElemRestrictionCreateUnorientedCopy
+FNL:6,125,128
+FNA:6,0,CeedElemRestrictionDestroy_Object
+FNL:7,1269,1274
+FNA:7,11568,CeedElemRestrictionReferenceCopy
+FNL:8,1287,1298
+FNA:8,960,CeedElemRestrictionCreateVector
+FNL:9,1314,1336
+FNA:9,576,CeedElemRestrictionApply
+FNL:10,1353,1391
+FNA:10,0,CeedElemRestrictionApplyAtPointsInElement
+FNL:11,1408,1446
+FNA:11,3072,CeedElemRestrictionApplyBlock
+FNL:12,1458,1461
+FNA:12,2976,CeedElemRestrictionGetCeed
+FNL:13,1472,1472
+FNA:13,0,CeedElemRestrictionReturnCeed
+FNL:14,148,151
+FNA:14,7296,CeedElemRestrictionGetType
+FNL:15,1484,1487
+FNA:15,5664,CeedElemRestrictionGetCompStride
+FNL:16,1499,1502
+FNA:16,10656,CeedElemRestrictionGetNumElements
+FNL:17,1514,1517
+FNA:17,16992,CeedElemRestrictionGetElementSize
+FNL:18,1530,1539
+FNA:18,0,CeedElemRestrictionGetNumPoints
+FNL:19,1553,1565
+FNA:19,0,CeedElemRestrictionGetNumPointsInElement
+FNL:20,1578,1605
+FNA:20,0,CeedElemRestrictionGetMinMaxPointsInElement
+FNL:21,1619,1621
+FNA:21,0,CeedElemRestrictionGetMaxPointsInElement
+FNL:22,163,166
+FNA:22,0,CeedElemRestrictionIsStrided
+FNL:23,1635,1637
+FNA:23,0,CeedElemRestrictionGetMinPointsInElement
+FNL:24,1649,1652
+FNA:24,6432,CeedElemRestrictionGetLVectorSize
+FNL:25,1664,1667
+FNA:25,1536,CeedElemRestrictionGetEVectorSize
+FNL:26,1679,1682
+FNA:26,9888,CeedElemRestrictionGetNumComponents
+FNL:27,1694,1697
+FNA:27,1872,CeedElemRestrictionGetNumBlocks
+FNL:28,1709,1712
+FNA:28,8016,CeedElemRestrictionGetBlockSize
+FNL:29,1724,1739
+FNA:29,0,CeedElemRestrictionGetMultiplicity
+FNL:30,1751,1754
+FNA:30,0,CeedElemRestrictionSetNumViewTabs
+FNL:31,1766,1769
+FNA:31,0,CeedElemRestrictionGetNumViewTabs
+FNL:32,178,181
+FNA:32,0,CeedElemRestrictionIsAtPoints
+FNL:33,1781,1818
+FNA:33,0,CeedElemRestrictionView
+FNL:34,1829,1845
+FNA:34,25152,CeedElemRestrictionDestroy
+FNL:35,194,222
+FNA:35,0,CeedElemRestrictionAtPointsAreCompatible
+FNL:36,234,238
+FNA:36,288,CeedElemRestrictionGetStrides
+FNL:37,250,255
+FNA:37,1536,CeedElemRestrictionHasBackendStrides
+FNL:38,269,279
+FNA:38,432,CeedElemRestrictionGetOffsets
+FNL:39,291,299
+FNA:39,432,CeedElemRestrictionRestoreOffsets
+FNL:40,313,319
+FNA:40,0,CeedElemRestrictionGetOrientations
+FNL:41,331,335
+FNA:41,0,CeedElemRestrictionRestoreOrientations
+FNL:42,349,355
+FNA:42,0,CeedElemRestrictionGetCurlOrientations
+FNL:43,367,371
+FNA:43,0,CeedElemRestrictionRestoreCurlOrientations
+FNL:44,38,48
+FNA:44,432,CeedPermutePadOffsets
+FNL:45,385,400
+FNA:45,0,CeedElemRestrictionGetLLayout
+FNL:46,414,422
+FNA:46,480,CeedElemRestrictionSetLLayout
+FNL:47,436,440
+FNA:47,0,CeedElemRestrictionGetELayout
+FNL:48,454,457
+FNA:48,1296,CeedElemRestrictionSetELayout
+FNL:49,472,496
+FNA:49,0,CeedElemRestrictionGetAtPointsElementOffset
+FNL:50,509,521
+FNA:50,0,CeedElemRestrictionSetAtPointsEVectorSize
+FNL:51,533,536
+FNA:51,7680,CeedElemRestrictionGetData
+FNL:52,548,551
+FNA:52,1296,CeedElemRestrictionSetData
+FNL:53,562,565
+FNA:53,9456,CeedElemRestrictionReference
+FNL:54,576,615
+FNA:54,0,CeedElemRestrictionGetFlopsEstimate
+FNL:55,64,73
+FNA:55,0,CeedPermutePadOrients
+FNL:56,657,687
+FNA:56,720,CeedElemRestrictionCreate
+FNL:57,712,744
+FNA:57,0,CeedElemRestrictionCreateOriented
+FNL:58,770,802
+FNA:58,0,CeedElemRestrictionCreateCurlOriented
+FNL:59,824,857
+FNA:59,360,CeedElemRestrictionCreateStrided
+FNL:60,89,99
+FNA:60,0,CeedPermutePadCurlOrients
+FNL:61,891,923
+FNA:61,0,CeedElemRestrictionCreateAtPoints
+FNL:62,950,989
+FNA:62,432,CeedElemRestrictionCreateBlocked
+FNF:63
+FNH:29
+DA:38,432
+DA:40,3024
+DA:41,9216
+DA:42,166464
+DA:43,159840
+DA:47,432
+DA:64,0
+DA:65,0
+DA:66,0
+DA:67,0
+DA:68,0
+DA:72,0
+DA:89,0
+DA:91,0
+DA:92,0
+DA:93,0
+DA:94,0
+DA:98,0
+DA:111,0
+DA:112,0
+DA:113,0
+DA:125,0
+DA:126,0
+DA:127,0
+DA:148,7296
+DA:149,7296
+DA:150,7296
+DA:163,0
+DA:164,0
+DA:165,0
+DA:178,0
+DA:179,0
+DA:180,0
+DA:194,0
+DA:198,0
+DA:200,0
+DA:203,0
+DA:204,0
+DA:205,0
+DA:206,0
+DA:209,0
+DA:210,0
+DA:211,0
+DA:212,0
+DA:215,0
+DA:216,0
+DA:217,0
+DA:218,0
+DA:219,0
+DA:221,0
+DA:234,288
+DA:235,288
+DA:236,1152
+DA:237,288
+DA:250,1536
+DA:251,1536
+DA:252,3072
+DA:253,1536
+DA:254,1536
+DA:269,432
+DA:270,432
+DA:271,0
+DA:273,432
+DA:275,432
+DA:276,432
+DA:278,432
+DA:291,432
+DA:292,432
+DA:293,0
+DA:295,432
+DA:296,432
+DA:298,432
+DA:313,0
+DA:314,0
+DA:316,0
+DA:317,0
+DA:318,0
+DA:331,0
+DA:332,0
+DA:333,0
+DA:334,0
+DA:349,0
+DA:350,0
+DA:352,0
+DA:353,0
+DA:354,0
+DA:367,0
+DA:368,0
+DA:369,0
+DA:370,0
+DA:385,0
+DA:389,0
+DA:390,0
+DA:392,0
+DA:393,0
+DA:394,0
+DA:395,0
+DA:397,0
+DA:399,0
+DA:414,480
+DA:417,480
+DA:418,480
+DA:420,1920
+DA:421,480
+DA:436,0
+DA:437,0
+DA:438,0
+DA:439,0
+DA:454,1296
+DA:455,5184
+DA:456,1296
+DA:472,0
+DA:476,0
+DA:477,0
+DA:481,0
+DA:482,0
+DA:483,0
+DA:487,0
+DA:488,0
+DA:489,0
+DA:492,0
+DA:493,0
+DA:495,0
+DA:509,0
+DA:512,0
+DA:513,0
+DA:515,0
+DA:519,0
+DA:520,0
+DA:533,7680
+DA:534,7680
+DA:535,7680
+DA:548,1296
+DA:549,1296
+DA:550,1296
+DA:562,9456
+DA:563,9456
+DA:564,9456
+DA:576,0
+DA:577,0
+DA:580,0
+DA:581,0
+DA:582,0
+DA:583,0
+DA:584,0
+DA:585,0
+DA:586,0
+DA:587,0
+DA:589,0
+DA:590,0
+DA:591,0
+DA:592,0
+DA:593,0
+DA:594,0
+DA:595,0
+DA:596,0
+DA:599,0
+DA:600,0
+DA:603,0
+DA:604,0
+DA:605,0
+DA:606,0
+DA:607,0
+DA:608,0
+DA:609,0
+DA:610,0
+DA:613,0
+DA:614,0
+DA:657,720
+DA:659,720
+DA:662,336
+DA:663,336
+DA:664,336
+DA:665,336
+DA:666,336
+DA:669,384
+DA:670,384
+DA:671,384
+DA:672,384
+DA:674,384
+DA:675,384
+DA:676,384
+DA:677,384
+DA:678,384
+DA:679,384
+DA:680,384
+DA:681,384
+DA:682,384
+DA:683,384
+DA:684,384
+DA:685,384
+DA:686,384
+DA:712,0
+DA:715,0
+DA:718,0
+DA:719,0
+DA:720,0
+DA:722,0
+DA:723,0
+DA:726,0
+DA:727,0
+DA:728,0
+DA:729,0
+DA:731,0
+DA:732,0
+DA:733,0
+DA:734,0
+DA:735,0
+DA:736,0
+DA:737,0
+DA:738,0
+DA:739,0
+DA:740,0
+DA:741,0
+DA:742,0
+DA:743,0
+DA:770,0
+DA:773,0
+DA:776,0
+DA:777,0
+DA:778,0
+DA:780,0
+DA:781,0
+DA:784,0
+DA:785,0
+DA:786,0
+DA:787,0
+DA:789,0
+DA:790,0
+DA:791,0
+DA:792,0
+DA:793,0
+DA:794,0
+DA:795,0
+DA:796,0
+DA:797,0
+DA:798,0
+DA:799,0
+DA:800,0
+DA:801,0
+DA:824,360
+DA:826,360
+DA:829,168
+DA:830,168
+DA:831,168
+DA:832,168
+DA:833,168
+DA:836,192
+DA:837,192
+DA:838,192
+DA:839,192
+DA:843,192
+DA:844,192
+DA:845,192
+DA:846,192
+DA:847,192
+DA:848,192
+DA:849,192
+DA:850,192
+DA:851,192
+DA:852,192
+DA:853,192
+DA:854,768
+DA:855,192
+DA:856,192
+DA:891,0
+DA:893,0
+DA:896,0
+DA:897,0
+DA:898,0
+DA:899,0
+DA:900,0
+DA:903,0
+DA:904,0
+DA:905,0
+DA:906,0
+DA:910,0
+DA:911,0
+DA:912,0
+DA:913,0
+DA:914,0
+DA:915,0
+DA:916,0
+DA:917,0
+DA:918,0
+DA:919,0
+DA:920,0
+DA:921,0
+DA:922,0
+DA:950,432
+DA:953,432
+DA:955,432
+DA:958,0
+DA:959,0
+DA:960,0
+DA:962,0
+DA:963,0
+DA:966,432
+DA:967,432
+DA:968,432
+DA:969,432
+DA:970,432
+DA:972,432
+DA:973,432
+DA:975,432
+DA:976,432
+DA:977,432
+DA:978,432
+DA:979,432
+DA:980,432
+DA:981,432
+DA:982,432
+DA:983,432
+DA:984,432
+DA:985,432
+DA:986,432
+DA:987,432
+DA:988,432
+DA:1018,0
+DA:1022,0
+DA:1024,0
+DA:1027,0
+DA:1028,0
+DA:1029,0
+DA:1031,0
+DA:1032,0
+DA:1035,0
+DA:1036,0
+DA:1037,0
+DA:1038,0
+DA:1040,0
+DA:1041,0
+DA:1042,0
+DA:1043,0
+DA:1045,0
+DA:1046,0
+DA:1047,0
+DA:1048,0
+DA:1049,0
+DA:1050,0
+DA:1051,0
+DA:1052,0
+DA:1053,0
+DA:1054,0
+DA:1055,0
+DA:1056,0
+DA:1058,0
+DA:1059,0
+DA:1090,0
+DA:1094,0
+DA:1096,0
+DA:1099,0
+DA:1100,0
+DA:1101,0
+DA:1103,0
+DA:1104,0
+DA:1107,0
+DA:1108,0
+DA:1109,0
+DA:1110,0
+DA:1111,0
+DA:1113,0
+DA:1114,0
+DA:1115,0
+DA:1116,0
+DA:1118,0
+DA:1119,0
+DA:1120,0
+DA:1121,0
+DA:1122,0
+DA:1123,0
+DA:1124,0
+DA:1125,0
+DA:1126,0
+DA:1127,0
+DA:1128,0
+DA:1129,0
+DA:1131,0
+DA:1132,0
+DA:1154,288
+DA:1156,288
+DA:1158,288
+DA:1161,0
+DA:1162,0
+DA:1163,0
+DA:1164,0
+DA:1165,0
+DA:1168,288
+DA:1169,288
+DA:1170,288
+DA:1171,288
+DA:1172,288
+DA:1176,288
+DA:1177,288
+DA:1178,288
+DA:1179,288
+DA:1180,288
+DA:1181,288
+DA:1182,288
+DA:1183,288
+DA:1184,288
+DA:1185,288
+DA:1186,288
+DA:1187,1152
+DA:1188,288
+DA:1189,288
+DA:1204,0
+DA:1205,0
+DA:1208,0
+DA:1209,0
+DA:1211,0
+DA:1212,0
+DA:1213,0
+DA:1214,0
+DA:1216,0
+DA:1219,0
+DA:1220,0
+DA:1235,0
+DA:1236,0
+DA:1239,0
+DA:1240,0
+DA:1242,0
+DA:1243,0
+DA:1244,0
+DA:1245,0
+DA:1247,0
+DA:1250,0
+DA:1251,0
+DA:1269,11568
+DA:1270,11568
+DA:1271,11568
+DA:1272,11568
+DA:1273,11568
+DA:1287,960
+DA:1291,960
+DA:1292,960
+DA:1293,960
+DA:1294,960
+DA:1295,960
+DA:1296,960
+DA:1297,960
+DA:1314,576
+DA:1318,576
+DA:1319,384
+DA:1320,384
+DA:1322,192
+DA:1323,192
+DA:1325,576
+DA:1326,576
+DA:1329,576
+DA:1330,576
+DA:1333,576
+DA:1334,576
+DA:1335,576
+DA:1353,0
+DA:1358,0
+DA:1361,0
+DA:1364,0
+DA:1365,0
+DA:1366,0
+DA:1367,0
+DA:1371,0
+DA:1372,0
+DA:1373,0
+DA:1374,0
+DA:1376,0
+DA:1377,0
+DA:1381,0
+DA:1382,0
+DA:1386,0
+DA:1387,0
+DA:1389,0
+DA:1390,0
+DA:1408,3072
+DA:1413,3072
+DA:1416,3072
+DA:1417,3072
+DA:1420,1536
+DA:1421,1536
+DA:1422,1536
+DA:1423,1536
+DA:1427,1536
+DA:1428,1536
+DA:1429,1536
+DA:1430,1536
+DA:1432,3072
+DA:1433,3072
+DA:1436,3072
+DA:1437,3072
+DA:1440,3072
+DA:1441,3072
+DA:1444,3072
+DA:1445,3072
+DA:1458,2976
+DA:1459,2976
+DA:1460,2976
+DA:1472,0
+DA:1484,5664
+DA:1485,5664
+DA:1486,5664
+DA:1499,10656
+DA:1500,10656
+DA:1501,10656
+DA:1514,16992
+DA:1515,16992
+DA:1516,16992
+DA:1530,0
+DA:1533,0
+DA:1534,0
+DA:1537,0
+DA:1538,0
+DA:1553,0
+DA:1557,0
+DA:1558,0
+DA:1561,0
+DA:1562,0
+DA:1563,0
+DA:1564,0
+DA:1578,0
+DA:1582,0
+DA:1583,0
+DA:1586,0
+DA:1589,0
+DA:1590,0
+DA:1591,0
+DA:1592,0
+DA:1596,0
+DA:1597,0
+DA:1598,0
+DA:1599,0
+DA:1600,0
+DA:1601,0
+DA:1602,0
+DA:1604,0
+DA:1619,0
+DA:1620,0
+DA:1635,0
+DA:1636,0
+DA:1649,6432
+DA:1650,6432
+DA:1651,6432
+DA:1664,1536
+DA:1665,1536
+DA:1666,1536
+DA:1679,9888
+DA:1680,9888
+DA:1681,9888
+DA:1694,1872
+DA:1695,1872
+DA:1696,1872
+DA:1709,8016
+DA:1710,8016
+DA:1711,8016
+DA:1724,0
+DA:1728,0
+DA:1731,0
+DA:1732,0
+DA:1734,0
+DA:1735,0
+DA:1737,0
+DA:1738,0
+DA:1751,0
+DA:1752,0
+DA:1753,0
+DA:1766,0
+DA:1767,0
+DA:1768,0
+DA:1781,0
+DA:1782,0
+DA:1786,0
+DA:1788,0
+DA:1789,0
+DA:1790,0
+DA:1793,0
+DA:1794,0
+DA:1797,0
+DA:1798,0
+DA:1805,0
+DA:1806,0
+DA:1808,0
+DA:1810,0
+DA:1813,0
+DA:1814,0
+DA:1816,0
+DA:1817,0
+DA:1829,25152
+DA:1830,25152
+DA:1831,24144
+DA:1832,24144
+DA:1834,1008
+DA:1838,1008
+DA:1839,1008
+DA:1841,1008
+DA:1842,1008
+DA:1843,1008
+DA:1844,1008
+LF:596
+LH:225
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-fortran.c
+FNL:0,1031,1036
+FNA:0,0,ceedoperatorcompositeaddsub_
+FNL:1,1039,1044
+FNA:1,0,ceedoperatorsetname_
+FNL:2,1047,1051
+FNA:2,0,ceedoperatorsetnumviewtabs_
+FNL:3,1054,1098
+FNA:3,0,ceedoperatorlinearassembleqfunction_
+FNL:4,1101,1124
+FNA:4,0,ceedoperatorlinearassemblediagonal_
+FNL:5,1127,1148
+FNA:5,0,ceedoperatormultigridlevelcreate_
+FNL:6,1151,1172
+FNA:6,0,ceedoperatormultigridlevelcreatetensorh1_
+FNL:7,1175,1196
+FNA:7,0,ceedoperatormultigridlevelcreateh1_
+FNL:8,119,132
+FNA:8,384,ceedvectorcreate_
+FNL:9,1199,1203
+FNA:9,0,ceedoperatorview_
+FNL:10,1206,1241
+FNA:10,0,ceedoperatorcreatefdmelementinverse_
+FNL:11,1244,1270
+FNA:11,192,ceedoperatorapply_
+FNL:12,1273,1299
+FNA:12,0,ceedoperatorapplyadd_
+FNL:13,1302,1307
+FNA:13,0,ceedoperatorapplyjacobian_
+FNL:14,1310,1322
+FNA:14,192,ceedoperatordestroy_
+FNL:15,135,137
+FNA:15,288,ceedvectorsetarray_
+FNL:16,140,145
+FNA:16,0,ceedvectortakearray_
+FNL:17,148,148
+FNA:17,0,ceedvectorsyncarray_
+FNL:18,151,151
+FNA:18,96,ceedvectorsetvalue_
+FNL:19,154,159
+FNA:19,0,ceedvectorgetarray_
+FNL:20,162,167
+FNA:20,96,ceedvectorgetarrayread_
+FNL:21,170,175
+FNA:21,0,ceedvectorgetarraywrite_
+FNL:22,178,182
+FNA:22,0,ceedvectorrestorearray_
+FNL:23,185,188
+FNA:23,96,ceedvectorrestorearrayread_
+FNL:24,191,193
+FNA:24,0,ceedvectornorm_
+FNL:25,196,196
+FNA:25,0,ceedvectorreciprocal_
+FNL:26,199,199
+FNA:26,0,ceedvectorsetnumviewtabs_
+FNL:27,202,202
+FNA:27,0,ceedvectorview_
+FNL:28,205,218
+FNA:28,384,ceedvectordestroy_
+FNL:29,229,246
+FNA:29,192,ceedelemrestrictioncreate_
+FNL:30,249,267
+FNA:30,0,ceedelemrestrictioncreateoriented_
+FNL:31,270,289
+FNA:31,0,ceedelemrestrictioncreatecurloriented_
+FNL:32,292,307
+FNA:32,96,ceedelemrestrictioncreatestrided_
+FNL:33,310,327
+FNA:33,0,ceedelemrestrictioncreateblocked_
+FNL:34,330,349
+FNA:34,0,ceedelemrestrictioncreateblockedoriented_
+FNL:35,353,372
+FNA:35,0,ceedelemrestrictioncreateblockedcurloriented_
+FNL:36,375,389
+FNA:36,0,ceedelemrestrictioncreateblockedstrided_
+FNL:37,397,419
+FNA:37,0,ceedelemrestrictionapply_
+FNL:38,422,444
+FNA:38,0,ceedelemrestrictionapplyblock_
+FNL:39,447,449
+FNA:39,0,ceedelemrestrictiongetmultiplicity_
+FNL:40,452,456
+FNA:40,0,ceedelemrestrictiongetelayout_
+FNL:41,459,461
+FNA:41,0,ceedelemrestrictionsetnumviewtabs_
+FNL:42,464,464
+FNA:42,0,ceedelemrestrictionview_
+FNL:43,467,479
+FNA:43,0,ceedrequestwait_
+FNL:44,482,495
+FNA:44,0,ceedelemrestrictiondestroy_
+FNL:45,506,518
+FNA:45,192,ceedbasiscreatetensorh1lagrange_
+FNL:46,521,536
+FNA:46,0,ceedbasiscreatetensorh1_
+FNL:47,539,553
+FNA:47,0,ceedbasiscreateh1_
+FNL:48,556,570
+FNA:48,0,ceedbasiscreatehdiv_
+FNL:49,573,587
+FNA:49,0,ceedbasiscreatehcurl_
+FNL:50,590,590
+FNA:50,0,ceedbasissetnumviewtabs_
+FNL:51,593,593
+FNA:51,0,ceedbasisview_
+FNL:52,596,598
+FNA:52,0,ceedbasisgetcollocatedgrad_
+FNL:53,601,604
+FNA:53,0,ceedbasisapply_
+FNL:54,607,607
+FNA:54,0,ceedbasisgetnumnodes_
+FNL:55,610,610
+FNA:55,0,ceedbasisgetnumquadraturepoints_
+FNL:56,613,618
+FNA:56,0,ceedbasisgetinterp1d_
+FNL:57,621,626
+FNA:57,0,ceedbasisgetgrad1d_
+FNL:58,629,634
+FNA:58,0,ceedbasisgetqref_
+FNL:59,637,650
+FNA:59,192,ceedbasisdestroy_
+FNL:60,64,78
+FNA:60,96,ceedinit_
+FNL:61,653,655
+FNA:61,0,ceedgaussquadrature_
+FNL:62,658,660
+FNA:62,96,ceedlobattoquadrature_
+FNL:63,671,683
+FNA:63,96,ceedqfunctioncontextcreate_
+FNL:64,686,689
+FNA:64,96,ceedqfunctioncontextsetdata_
+FNL:65,692,697
+FNA:65,0,ceedqfunctioncontextgetdata_
+FNL:66,700,703
+FNA:66,0,ceedqfunctioncontextrestoredata_
+FNL:67,706,708
+FNA:67,0,ceedqfunctioncontextsetnumviewtabs_
+FNL:68,711,711
+FNA:68,0,ceedqfunctioncontextview_
+FNL:69,714,727
+FNA:69,96,ceedqfunctioncontextdestroy_
+FNL:70,737,760
+FNA:70,768,CeedQFunctionFortranStub
+FNL:71,763,801
+FNA:71,96,ceedqfunctioncreateinterior_
+FNL:72,804,818
+FNA:72,96,ceedqfunctioncreateinteriorbyname_
+FNL:73,81,83
+FNA:73,0,ceedisdeterministic_
+FNL:74,821,834
+FNA:74,0,ceedqfunctioncreateidentity_
+FNL:75,837,843
+FNA:75,192,ceedqfunctionaddinput_
+FNL:76,846,852
+FNA:76,96,ceedqfunctionaddoutput_
+FNL:77,855,869
+FNA:77,48,ceedqfunctionsetcontext_
+FNL:78,86,86
+FNA:78,0,ceedgetpreferredmemtype_
+FNL:79,872,874
+FNA:79,0,ceedqfunctionsetnumviewtabs_
+FNL:80,877,881
+FNA:80,0,ceedqfunctionview_
+FNL:81,885,933
+FNA:81,0,ceedqfunctionapply_
+FNL:82,89,89
+FNA:82,0,ceedsetnumviewtabs_
+FNL:83,92,92
+FNA:83,0,ceedview_
+FNL:84,936,949
+FNA:84,192,ceedqfunctiondestroy_
+FNL:85,95,108
+FNA:85,96,ceeddestroy_
+FNL:86,960,976
+FNA:86,192,ceedoperatorcreate_
+FNL:87,979,991
+FNA:87,0,ceedoperatorcreatecomposite_
+FNL:88,994,1028
+FNA:88,576,ceedoperatorsetfield_
+FNF:89
+FNH:27
+DA:64,96
+DA:65,96
+DA:66,96
+DA:67,96
+DA:68,96
+DA:71,96
+DA:72,96
+DA:74,96
+DA:75,96
+DA:76,96
+DA:78,96
+DA:81,0
+DA:82,0
+DA:83,0
+DA:86,0
+DA:89,0
+DA:92,0
+DA:95,96
+DA:96,96
+DA:97,96
+DA:99,96
+DA:100,96
+DA:101,96
+DA:102,96
+DA:103,96
+DA:104,96
+DA:105,96
+DA:119,384
+DA:120,384
+DA:121,288
+DA:122,288
+DA:125,384
+DA:126,384
+DA:128,384
+DA:129,384
+DA:130,384
+DA:132,384
+DA:135,288
+DA:136,288
+DA:137,288
+DA:140,0
+DA:142,0
+DA:143,0
+DA:144,0
+DA:145,0
+DA:148,0
+DA:151,96
+DA:154,0
+DA:156,0
+DA:157,0
+DA:158,0
+DA:159,0
+DA:162,96
+DA:164,96
+DA:165,96
+DA:166,96
+DA:167,96
+DA:170,0
+DA:172,0
+DA:173,0
+DA:174,0
+DA:175,0
+DA:178,0
+DA:179,0
+DA:180,0
+DA:181,0
+DA:182,0
+DA:185,96
+DA:186,96
+DA:187,96
+DA:188,96
+DA:191,0
+DA:192,0
+DA:193,0
+DA:196,0
+DA:199,0
+DA:202,0
+DA:205,384
+DA:206,384
+DA:207,384
+DA:209,384
+DA:210,384
+DA:211,384
+DA:212,384
+DA:213,96
+DA:214,96
+DA:215,96
+DA:229,192
+DA:231,192
+DA:232,192
+DA:233,192
+DA:236,192
+DA:238,192
+DA:239,384
+DA:240,192
+DA:242,192
+DA:243,192
+DA:244,192
+DA:246,192
+DA:249,0
+DA:251,0
+DA:252,0
+DA:253,0
+DA:256,0
+DA:257,0
+DA:259,0
+DA:260,0
+DA:261,0
+DA:263,0
+DA:264,0
+DA:265,0
+DA:267,0
+DA:270,0
+DA:273,0
+DA:274,0
+DA:275,0
+DA:278,0
+DA:279,0
+DA:281,0
+DA:282,0
+DA:283,0
+DA:285,0
+DA:286,0
+DA:287,0
+DA:289,0
+DA:292,96
+DA:294,96
+DA:295,96
+DA:296,96
+DA:299,96
+DA:300,96
+DA:301,96
+DA:303,96
+DA:304,96
+DA:305,96
+DA:307,96
+DA:310,0
+DA:312,0
+DA:313,0
+DA:314,0
+DA:317,0
+DA:319,0
+DA:320,0
+DA:321,0
+DA:323,0
+DA:324,0
+DA:325,0
+DA:327,0
+DA:330,0
+DA:333,0
+DA:334,0
+DA:335,0
+DA:338,0
+DA:339,0
+DA:341,0
+DA:342,0
+DA:343,0
+DA:345,0
+DA:346,0
+DA:347,0
+DA:349,0
+DA:353,0
+DA:356,0
+DA:357,0
+DA:358,0
+DA:361,0
+DA:362,0
+DA:364,0
+DA:365,0
+DA:366,0
+DA:368,0
+DA:369,0
+DA:370,0
+DA:372,0
+DA:375,0
+DA:377,0
+DA:378,0
+DA:379,0
+DA:382,0
+DA:383,0
+DA:385,0
+DA:386,0
+DA:387,0
+DA:389,0
+DA:397,0
+DA:398,0
+DA:400,0
+DA:402,0
+DA:403,0
+DA:404,0
+DA:408,0
+DA:409,0
+DA:410,0
+DA:412,0
+DA:413,0
+DA:415,0
+DA:416,0
+DA:417,0
+DA:419,0
+DA:422,0
+DA:423,0
+DA:425,0
+DA:427,0
+DA:428,0
+DA:429,0
+DA:433,0
+DA:434,0
+DA:435,0
+DA:437,0
+DA:438,0
+DA:440,0
+DA:441,0
+DA:442,0
+DA:444,0
+DA:447,0
+DA:448,0
+DA:449,0
+DA:452,0
+DA:454,0
+DA:455,0
+DA:456,0
+DA:459,0
+DA:460,0
+DA:461,0
+DA:464,0
+DA:467,0
+DA:471,0
+DA:472,0
+DA:473,0
+DA:474,0
+DA:475,0
+DA:476,0
+DA:479,0
+DA:482,0
+DA:483,0
+DA:484,0
+DA:486,0
+DA:487,0
+DA:488,0
+DA:489,0
+DA:490,0
+DA:491,0
+DA:492,0
+DA:506,192
+DA:507,192
+DA:508,192
+DA:509,192
+DA:512,192
+DA:514,192
+DA:515,192
+DA:516,192
+DA:518,192
+DA:521,0
+DA:524,0
+DA:525,0
+DA:526,0
+DA:529,0
+DA:530,0
+DA:532,0
+DA:533,0
+DA:534,0
+DA:536,0
+DA:539,0
+DA:541,0
+DA:542,0
+DA:543,0
+DA:546,0
+DA:547,0
+DA:549,0
+DA:550,0
+DA:551,0
+DA:553,0
+DA:556,0
+DA:558,0
+DA:559,0
+DA:560,0
+DA:563,0
+DA:564,0
+DA:566,0
+DA:567,0
+DA:568,0
+DA:570,0
+DA:573,0
+DA:575,0
+DA:576,0
+DA:577,0
+DA:580,0
+DA:581,0
+DA:583,0
+DA:584,0
+DA:585,0
+DA:587,0
+DA:590,0
+DA:593,0
+DA:596,0
+DA:597,0
+DA:598,0
+DA:601,0
+DA:602,0
+DA:603,0
+DA:604,0
+DA:607,0
+DA:610,0
+DA:613,0
+DA:615,0
+DA:616,0
+DA:617,0
+DA:618,0
+DA:621,0
+DA:623,0
+DA:624,0
+DA:625,0
+DA:626,0
+DA:629,0
+DA:631,0
+DA:632,0
+DA:633,0
+DA:634,0
+DA:637,192
+DA:638,192
+DA:639,192
+DA:641,192
+DA:642,192
+DA:643,192
+DA:644,192
+DA:645,96
+DA:646,96
+DA:647,96
+DA:653,0
+DA:654,0
+DA:655,0
+DA:658,96
+DA:659,96
+DA:660,96
+DA:671,96
+DA:672,96
+DA:673,96
+DA:674,96
+DA:677,96
+DA:679,96
+DA:680,96
+DA:681,96
+DA:682,96
+DA:686,96
+DA:687,96
+DA:688,96
+DA:689,96
+DA:692,0
+DA:694,0
+DA:695,0
+DA:696,0
+DA:697,0
+DA:700,0
+DA:701,0
+DA:702,0
+DA:703,0
+DA:706,0
+DA:707,0
+DA:708,0
+DA:711,0
+DA:714,96
+DA:715,96
+DA:716,96
+DA:718,96
+DA:719,96
+DA:720,96
+DA:721,96
+DA:722,96
+DA:723,96
+DA:724,96
+DA:737,768
+DA:738,768
+DA:739,768
+DA:742,768
+DA:746,768
+DA:747,384
+DA:748,384
+DA:751,768
+DA:752,768
+DA:754,768
+DA:755,384
+DA:756,384
+DA:759,768
+DA:763,96
+DA:771,96
+DA:772,96
+DA:773,96
+DA:774,96
+DA:777,96
+DA:778,96
+DA:780,96
+DA:781,96
+DA:782,96
+DA:786,96
+DA:787,96
+DA:788,96
+DA:789,96
+DA:791,96
+DA:792,96
+DA:793,96
+DA:794,96
+DA:795,96
+DA:796,96
+DA:797,96
+DA:798,96
+DA:800,96
+DA:804,96
+DA:805,96
+DA:806,96
+DA:807,96
+DA:808,96
+DA:811,96
+DA:812,96
+DA:814,96
+DA:815,96
+DA:816,96
+DA:818,96
+DA:821,0
+DA:822,0
+DA:823,0
+DA:824,0
+DA:827,0
+DA:828,0
+DA:830,0
+DA:831,0
+DA:832,0
+DA:834,0
+DA:837,192
+DA:839,192
+DA:840,192
+DA:842,192
+DA:843,192
+DA:846,96
+DA:848,96
+DA:849,96
+DA:851,96
+DA:852,96
+DA:855,48
+DA:856,48
+DA:857,48
+DA:860,48
+DA:861,48
+DA:863,48
+DA:864,48
+DA:865,48
+DA:866,48
+DA:867,48
+DA:868,48
+DA:872,0
+DA:873,0
+DA:874,0
+DA:877,0
+DA:878,0
+DA:880,0
+DA:881,0
+DA:885,0
+DA:888,0
+DA:890,0
+DA:891,0
+DA:892,0
+DA:893,0
+DA:894,0
+DA:895,0
+DA:896,0
+DA:897,0
+DA:898,0
+DA:899,0
+DA:900,0
+DA:901,0
+DA:902,0
+DA:903,0
+DA:904,0
+DA:905,0
+DA:906,0
+DA:907,0
+DA:909,0
+DA:910,0
+DA:911,0
+DA:912,0
+DA:913,0
+DA:914,0
+DA:915,0
+DA:916,0
+DA:917,0
+DA:918,0
+DA:919,0
+DA:920,0
+DA:921,0
+DA:922,0
+DA:923,0
+DA:924,0
+DA:925,0
+DA:926,0
+DA:927,0
+DA:928,0
+DA:930,0
+DA:931,0
+DA:932,0
+DA:936,192
+DA:937,192
+DA:939,192
+DA:940,192
+DA:941,192
+DA:942,192
+DA:943,192
+DA:944,96
+DA:945,96
+DA:946,96
+DA:960,192
+DA:961,192
+DA:962,192
+DA:963,192
+DA:966,192
+DA:968,192
+DA:969,192
+DA:970,192
+DA:972,192
+DA:973,192
+DA:974,192
+DA:975,192
+DA:979,0
+DA:980,0
+DA:981,0
+DA:982,0
+DA:985,0
+DA:987,0
+DA:988,0
+DA:989,0
+DA:990,0
+DA:994,576
+DA:995,576
+DA:1000,576
+DA:1002,576
+DA:1003,0
+DA:1004,576
+DA:1005,96
+DA:1007,480
+DA:1010,576
+DA:1011,0
+DA:1012,576
+DA:1013,192
+DA:1015,384
+DA:1017,576
+DA:1018,0
+DA:1019,576
+DA:1020,384
+DA:1021,192
+DA:1022,96
+DA:1024,96
+DA:1027,576
+DA:1028,576
+DA:1031,0
+DA:1032,0
+DA:1033,0
+DA:1035,0
+DA:1036,0
+DA:1039,0
+DA:1040,0
+DA:1041,0
+DA:1043,0
+DA:1044,0
+DA:1047,0
+DA:1048,0
+DA:1050,0
+DA:1051,0
+DA:1054,0
+DA:1056,0
+DA:1057,0
+DA:1058,0
+DA:1060,0
+DA:1063,0
+DA:1064,0
+DA:1065,0
+DA:1067,0
+DA:1069,0
+DA:1071,0
+DA:1072,0
+DA:1075,0
+DA:1076,0
+DA:1077,0
+DA:1081,0
+DA:1082,0
+DA:1083,0
+DA:1085,0
+DA:1086,0
+DA:1087,0
+DA:1088,0
+DA:1089,0
+DA:1092,0
+DA:1093,0
+DA:1094,0
+DA:1095,0
+DA:1096,0
+DA:1101,0
+DA:1102,0
+DA:1104,0
+DA:1105,0
+DA:1108,0
+DA:1109,0
+DA:1110,0
+DA:1114,0
+DA:1115,0
+DA:1116,0
+DA:1118,0
+DA:1119,0
+DA:1120,0
+DA:1121,0
+DA:1122,0
+DA:1127,0
+DA:1133,0
+DA:1134,0
+DA:1136,0
+DA:1137,0
+DA:1138,0
+DA:1140,0
+DA:1141,0
+DA:1142,0
+DA:1143,0
+DA:1144,0
+DA:1145,0
+DA:1146,0
+DA:1147,0
+DA:1151,0
+DA:1157,0
+DA:1158,0
+DA:1160,0
+DA:1161,0
+DA:1162,0
+DA:1164,0
+DA:1165,0
+DA:1166,0
+DA:1167,0
+DA:1168,0
+DA:1169,0
+DA:1170,0
+DA:1171,0
+DA:1175,0
+DA:1181,0
+DA:1182,0
+DA:1184,0
+DA:1185,0
+DA:1186,0
+DA:1188,0
+DA:1189,0
+DA:1190,0
+DA:1191,0
+DA:1192,0
+DA:1193,0
+DA:1194,0
+DA:1195,0
+DA:1199,0
+DA:1200,0
+DA:1202,0
+DA:1203,0
+DA:1206,0
+DA:1208,0
+DA:1209,0
+DA:1210,0
+DA:1212,0
+DA:1214,0
+DA:1216,0
+DA:1217,0
+DA:1220,0
+DA:1221,0
+DA:1222,0
+DA:1226,0
+DA:1227,0
+DA:1228,0
+DA:1230,0
+DA:1231,0
+DA:1232,0
+DA:1233,0
+DA:1234,0
+DA:1237,0
+DA:1238,0
+DA:1239,0
+DA:1244,192
+DA:1245,192
+DA:1246,192
+DA:1248,192
+DA:1250,192
+DA:1251,192
+DA:1254,192
+DA:1255,0
+DA:1256,0
+DA:1260,192
+DA:1261,0
+DA:1262,0
+DA:1264,192
+DA:1265,192
+DA:1266,192
+DA:1267,0
+DA:1268,0
+DA:1273,0
+DA:1274,0
+DA:1275,0
+DA:1277,0
+DA:1279,0
+DA:1280,0
+DA:1283,0
+DA:1284,0
+DA:1285,0
+DA:1289,0
+DA:1290,0
+DA:1291,0
+DA:1293,0
+DA:1294,0
+DA:1295,0
+DA:1296,0
+DA:1297,0
+DA:1302,0
+DA:1307,0
+DA:1310,192
+DA:1311,192
+DA:1312,192
+DA:1313,192
+DA:1314,192
+DA:1315,192
+DA:1316,192
+DA:1317,96
+DA:1318,96
+DA:1319,96
+LF:722
+LH:252
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-jit-tools.c
+FNL:0,124,296
+FNA:0,0,CeedLoadSourceToInitializedBuffer
+FNL:1,27,61
+FNA:1,144,CeedCheckFilePath
+FNL:2,313,324
+FNA:2,0,CeedLoadSourceAndInitializeBuffer
+FNL:3,339,350
+FNA:3,0,CeedLoadSourceToBuffer
+FNL:4,368,377
+FNA:4,48,CeedPathConcatenate
+FNL:5,389,393
+FNA:5,1536,CeedGetJitRelativePath
+FNL:6,406,439
+FNA:6,48,CeedGetJitAbsolutePath
+FNL:7,74,107
+FNA:7,0,CeedNormalizePath
+FNF:8
+FNH:4
+DA:27,144
+DA:31,144
+DA:33,144
+DA:34,144
+DA:36,144
+DA:37,144
+DA:39,0
+DA:43,144
+DA:44,144
+DA:48,144
+DA:49,144
+DA:51,144
+DA:53,96
+DA:54,96
+DA:55,96
+DA:59,144
+DA:60,144
+DA:74,0
+DA:75,0
+DA:77,0
+DA:79,0
+DA:80,0
+DA:81,0
+DA:84,0
+DA:85,0
+DA:87,0
+DA:88,0
+DA:89,0
+DA:92,0
+DA:93,0
+DA:95,0
+DA:96,0
+DA:98,0
+DA:99,0
+DA:100,0
+DA:101,0
+DA:104,0
+DA:106,0
+DA:124,0
+DA:126,0
+DA:130,0
+DA:131,0
+DA:132,0
+DA:135,0
+DA:136,0
+DA:138,0
+DA:139,0
+DA:140,0
+DA:143,0
+DA:146,0
+DA:147,0
+DA:150,0
+DA:157,0
+DA:160,0
+DA:162,0
+DA:164,0
+DA:165,0
+DA:167,0
+DA:168,0
+DA:171,0
+DA:172,0
+DA:173,0
+DA:176,0
+DA:178,0
+DA:179,0
+DA:180,0
+DA:183,0
+DA:184,0
+DA:186,0
+DA:187,0
+DA:188,0
+DA:189,0
+DA:191,0
+DA:195,0
+DA:197,0
+DA:198,0
+DA:201,0
+DA:202,0
+DA:203,0
+DA:206,0
+DA:208,0
+DA:209,0
+DA:211,0
+DA:212,0
+DA:213,0
+DA:214,0
+DA:216,0
+DA:217,0
+DA:218,0
+DA:219,0
+DA:220,0
+DA:221,0
+DA:222,0
+DA:223,0
+DA:224,0
+DA:225,0
+DA:227,0
+DA:229,0
+DA:232,0
+DA:233,0
+DA:234,0
+DA:236,0
+DA:237,0
+DA:238,0
+DA:239,0
+DA:241,0
+DA:243,0
+DA:245,0
+DA:246,0
+DA:247,0
+DA:248,0
+DA:253,0
+DA:254,0
+DA:255,0
+DA:256,0
+DA:257,0
+DA:258,0
+DA:259,0
+DA:260,0
+DA:262,0
+DA:263,0
+DA:264,0
+DA:265,0
+DA:267,0
+DA:268,0
+DA:269,0
+DA:270,0
+DA:272,0
+DA:275,0
+DA:278,0
+DA:279,0
+DA:281,0
+DA:282,0
+DA:283,0
+DA:284,0
+DA:287,0
+DA:290,0
+DA:291,0
+DA:292,0
+DA:293,0
+DA:294,0
+DA:295,0
+DA:313,0
+DA:315,0
+DA:316,0
+DA:319,0
+DA:322,0
+DA:323,0
+DA:339,0
+DA:340,0
+DA:341,0
+DA:344,0
+DA:347,0
+DA:348,0
+DA:349,0
+DA:368,48
+DA:369,48
+DA:370,48
+DA:371,48
+DA:373,48
+DA:374,48
+DA:375,48
+DA:376,48
+DA:389,1536
+DA:390,1536
+DA:391,1536
+DA:392,1536
+DA:406,48
+DA:411,48
+DA:412,48
+DA:413,48
+DA:415,48
+DA:416,48
+DA:420,48
+DA:421,48
+DA:424,96
+DA:425,48
+DA:427,48
+DA:428,48
+DA:429,48
+LF:180
+LH:41
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-object.c
+FNL:0,108,111
+FNA:0,0,CeedObjectView
+FNL:1,123,127
+FNA:1,0,CeedObjectSetNumViewTabs
+FNL:2,139,142
+FNA:2,0,CeedObjectGetNumViewTabs
+FNL:3,154,158
+FNA:3,9888,CeedObjectGetCeed
+FNL:4,169,169
+FNA:4,11232,CeedObjectReturnCeed
+FNL:5,180,183
+FNA:5,0,CeedObjectDestroy
+FNL:6,37,45
+FNA:6,7200,CeedObjectCreate
+FNL:7,56,59
+FNA:7,49140,CeedObjectReference
+FNL:8,70,72
+FNA:8,55728,CeedObjectDereference
+FNL:9,83,88
+FNA:9,6780,CeedObjectDestroy_Private
+FNF:10
+FNH:6
+DA:37,7200
+DA:38,7200
+DA:39,7200
+DA:40,7200
+DA:41,7200
+DA:42,7200
+DA:43,7200
+DA:44,7200
+DA:56,49140
+DA:57,49140
+DA:58,49140
+DA:70,55728
+DA:71,55728
+DA:83,6780
+DA:84,6780
+DA:86,6780
+DA:87,6780
+DA:108,0
+DA:109,0
+DA:110,0
+DA:123,0
+DA:124,0
+DA:125,0
+DA:126,0
+DA:139,0
+DA:140,0
+DA:141,0
+DA:154,9888
+DA:155,9888
+DA:156,9888
+DA:157,9888
+DA:169,11232
+DA:180,0
+DA:181,0
+DA:182,0
+LF:35
+LH:22
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-operator.c
+FNL:0,1049,1064
+FNA:0,1920,CeedOperatorGetFields
+FNL:1,1079,1100
+FNA:1,0,CeedOperatorAtPointsSetPoints
+FNL:2,1112,1115
+FNA:2,1536,CeedOperatorIsAtPoints
+FNL:3,1130,1146
+FNA:3,0,CeedOperatorAtPointsGetPoints
+FNL:4,1163,1185
+FNA:4,0,CeedOperatorGetFieldByName
+FNL:5,1197,1200
+FNA:5,0,CeedOperatorFieldGetName
+FNL:6,1214,1218
+FNA:6,10416,CeedOperatorFieldGetElemRestriction
+FNL:7,1232,1236
+FNA:7,5376,CeedOperatorFieldGetBasis
+FNL:8,1250,1254
+FNA:8,7776,CeedOperatorFieldGetVector
+FNL:9,1273,1279
+FNA:9,0,CeedOperatorFieldGetData
+FNL:10,1291,1319
+FNA:10,0,CeedOperatorCompositeAddSub
+FNL:11,1331,1338
+FNA:11,0,CeedOperatorCompositeGetNumSub
+FNL:12,1350,1357
+FNA:12,0,CeedOperatorCompositeGetSubList
+FNL:13,1374,1391
+FNA:13,0,CeedOperatorCompositeGetSubByName
+FNL:14,138,175
+FNA:14,0,CeedOperatorSingleView
+FNL:15,1402,1450
+FNA:15,3072,CeedOperatorCheckReady
+FNL:16,1465,1494
+FNA:16,0,CeedOperatorGetActiveVectorLengths
+FNL:17,1509,1524
+FNA:17,0,CeedOperatorSetQFunctionAssemblyReuse
+FNL:18,1536,1556
+FNA:18,0,CeedOperatorSetQFunctionAssemblyDataUpdateNeeded
+FNL:19,1568,1579
+FNA:19,0,CeedOperatorSetName
+FNL:20,1591,1602
+FNA:20,0,CeedOperatorGetName
+FNL:21,1615,1652
+FNA:21,0,CeedOperatorView_Core
+FNL:22,1664,1667
+FNA:22,0,CeedOperatorSetNumViewTabs
+FNL:23,1679,1682
+FNA:23,0,CeedOperatorGetNumViewTabs
+FNL:24,1694,1697
+FNA:24,0,CeedOperatorView
+FNL:25,1709,1712
+FNA:25,0,CeedOperatorViewTerse
+FNL:26,1724,1727
+FNA:26,1536,CeedOperatorGetCeed
+FNL:27,1738,1738
+FNA:27,1152,CeedOperatorReturnCeed
+FNL:28,1750,1757
+FNA:28,384,CeedOperatorGetNumElements
+FNL:29,1769,1776
+FNA:29,768,CeedOperatorGetNumQuadraturePoints
+FNL:30,1786,1907
+FNA:30,0,CeedOperatorGetFlopsEstimate
+FNL:31,187,190
+FNA:31,0,CeedOperatorView_Object
+FNL:32,1924,1937
+FNA:32,0,CeedOperatorGetContext
+FNL:33,1952,2051
+FNA:33,0,CeedOperatorGetContextFieldLabel
+FNL:34,201,204
+FNA:34,0,CeedOperatorDestroy_Object
+FNL:35,2066,2068
+FNA:35,0,CeedOperatorSetContextDouble
+FNL:36,2084,2086
+FNA:36,0,CeedOperatorGetContextDoubleRead
+FNL:37,2099,2101
+FNA:37,0,CeedOperatorRestoreContextDoubleRead
+FNL:38,2116,2118
+FNA:38,0,CeedOperatorSetContextInt32
+FNL:39,2134,2136
+FNA:39,0,CeedOperatorGetContextInt32Read
+FNL:40,2149,2151
+FNA:40,0,CeedOperatorRestoreContextInt32Read
+FNL:41,2166,2168
+FNA:41,0,CeedOperatorSetContextBoolean
+FNL:42,218,221
+FNA:42,0,CeedOperatorGetActiveBasis
+FNL:43,2184,2186
+FNA:43,0,CeedOperatorGetContextBooleanRead
+FNL:44,2199,2201
+FNA:44,0,CeedOperatorRestoreContextBooleanRead
+FNL:45,2220,2241
+FNA:45,384,CeedOperatorApply
+FNL:46,2262,2287
+FNA:46,384,CeedOperatorApplyAdd
+FNL:47,2306,2354
+FNA:47,384,CeedOperatorApplyAddActive
+FNL:48,236,287
+FNA:48,0,CeedOperatorGetActiveBases
+FNL:49,2365,2383
+FNA:49,384,CeedOperatorAssemblyDataStrip
+FNL:50,2394,2466
+FNA:50,768,CeedOperatorDestroy
+FNL:51,301,304
+FNA:51,0,CeedOperatorGetActiveElemRestriction
+FNL:52,319,370
+FNA:52,0,CeedOperatorGetActiveElemRestrictions
+FNL:53,36,82
+FNA:53,1152,CeedOperatorCheckField
+FNL:54,387,432
+FNA:54,0,CeedOperatorContextSetGeneric
+FNL:55,450,500
+FNA:55,0,CeedOperatorContextGetGenericRead
+FNL:56,517,563
+FNA:56,0,CeedOperatorContextRestoreGenericRead
+FNL:57,583,590
+FNA:57,0,CeedOperatorGetNumArgs
+FNL:58,604,633
+FNA:58,0,CeedOperatorHasTensorBases
+FNL:59,645,648
+FNA:59,1152,CeedOperatorIsImmutable
+FNL:60,660,663
+FNA:60,384,CeedOperatorIsSetupDone
+FNL:61,675,683
+FNA:61,4224,CeedOperatorGetQFunction
+FNL:62,695,698
+FNA:62,10368,CeedOperatorIsComposite
+FNL:63,710,713
+FNA:63,1152,CeedOperatorGetData
+FNL:64,725,728
+FNA:64,384,CeedOperatorSetData
+FNL:65,739,742
+FNA:65,0,CeedOperatorReference
+FNL:66,753,756
+FNA:66,384,CeedOperatorSetSetupDone
+FNL:67,781,805
+FNA:67,576,CeedOperatorCreate
+FNL:68,823,848
+FNA:68,0,CeedOperatorCreateAtPoints
+FNL:69,860,881
+FNA:69,0,CeedOperatorCreateComposite
+FNL:70,898,903
+FNA:70,0,CeedOperatorReferenceCopy
+FNL:71,928,1032
+FNA:71,1152,CeedOperatorSetField
+FNL:72,98,125
+FNA:72,0,CeedOperatorFieldView
+FNF:73
+FNH:25
+DA:36,1152
+DA:38,1152
+DA:42,1152
+DA:45,1152
+DA:47,1152
+DA:48,960
+DA:51,1152
+DA:53,1152
+DA:54,768
+DA:55,768
+DA:56,768
+DA:57,768
+DA:63,1152
+DA:64,384
+DA:65,384
+DA:68,384
+DA:69,576
+DA:73,576
+DA:76,576
+DA:77,192
+DA:79,192
+DA:81,1152
+DA:98,0
+DA:100,0
+DA:108,0
+DA:109,0
+DA:111,0
+DA:116,0
+DA:117,0
+DA:118,0
+DA:119,0
+DA:120,0
+DA:122,0
+DA:123,0
+DA:124,0
+DA:138,0
+DA:140,0
+DA:145,0
+DA:146,0
+DA:147,0
+DA:148,0
+DA:149,0
+DA:150,0
+DA:151,0
+DA:152,0
+DA:154,0
+DA:155,0
+DA:158,0
+DA:159,0
+DA:160,0
+DA:161,0
+DA:163,0
+DA:165,0
+DA:166,0
+DA:167,0
+DA:168,0
+DA:170,0
+DA:171,0
+DA:172,0
+DA:174,0
+DA:187,0
+DA:188,0
+DA:189,0
+DA:201,0
+DA:202,0
+DA:203,0
+DA:218,0
+DA:219,0
+DA:220,0
+DA:236,0
+DA:241,0
+DA:242,0
+DA:244,0
+DA:245,0
+DA:246,0
+DA:247,0
+DA:250,0
+DA:251,0
+DA:254,0
+DA:255,0
+DA:257,0
+DA:258,0
+DA:260,0
+DA:262,0
+DA:265,0
+DA:266,0
+DA:267,0
+DA:268,0
+DA:271,0
+DA:272,0
+DA:275,0
+DA:276,0
+DA:278,0
+DA:279,0
+DA:281,0
+DA:283,0
+DA:286,0
+DA:301,0
+DA:302,0
+DA:303,0
+DA:319,0
+DA:324,0
+DA:325,0
+DA:327,0
+DA:328,0
+DA:329,0
+DA:330,0
+DA:333,0
+DA:334,0
+DA:337,0
+DA:338,0
+DA:340,0
+DA:341,0
+DA:343,0
+DA:345,0
+DA:348,0
+DA:349,0
+DA:350,0
+DA:351,0
+DA:354,0
+DA:355,0
+DA:358,0
+DA:359,0
+DA:361,0
+DA:362,0
+DA:364,0
+DA:366,0
+DA:369,0
+DA:387,0
+DA:388,0
+DA:390,0
+DA:393,0
+DA:394,0
+DA:396,0
+DA:397,0
+DA:399,0
+DA:402,0
+DA:403,0
+DA:407,0
+DA:408,0
+DA:409,0
+DA:412,0
+DA:415,0
+DA:417,0
+DA:418,0
+DA:420,0
+DA:425,0
+DA:426,0
+DA:427,0
+DA:428,0
+DA:430,0
+DA:431,0
+DA:450,0
+DA:452,0
+DA:454,0
+DA:456,0
+DA:457,0
+DA:460,0
+DA:461,0
+DA:463,0
+DA:464,0
+DA:466,0
+DA:469,0
+DA:470,0
+DA:474,0
+DA:475,0
+DA:476,0
+DA:479,0
+DA:482,0
+DA:484,0
+DA:485,0
+DA:486,0
+DA:487,0
+DA:489,0
+DA:494,0
+DA:495,0
+DA:496,0
+DA:497,0
+DA:499,0
+DA:517,0
+DA:518,0
+DA:520,0
+DA:523,0
+DA:524,0
+DA:526,0
+DA:527,0
+DA:529,0
+DA:532,0
+DA:533,0
+DA:537,0
+DA:538,0
+DA:539,0
+DA:542,0
+DA:545,0
+DA:547,0
+DA:548,0
+DA:549,0
+DA:550,0
+DA:552,0
+DA:557,0
+DA:558,0
+DA:559,0
+DA:560,0
+DA:562,0
+DA:583,0
+DA:586,0
+DA:587,0
+DA:588,0
+DA:589,0
+DA:604,0
+DA:608,0
+DA:609,0
+DA:610,0
+DA:614,0
+DA:615,0
+DA:616,0
+DA:617,0
+DA:619,0
+DA:621,0
+DA:625,0
+DA:626,0
+DA:627,0
+DA:628,0
+DA:630,0
+DA:632,0
+DA:645,1152
+DA:646,1152
+DA:647,1152
+DA:660,384
+DA:661,384
+DA:662,384
+DA:675,4224
+DA:678,4224
+DA:679,4224
+DA:680,4224
+DA:681,4224
+DA:682,4224
+DA:695,10368
+DA:696,10368
+DA:697,10368
+DA:710,1152
+DA:711,1152
+DA:712,1152
+DA:725,384
+DA:726,384
+DA:727,384
+DA:739,0
+DA:740,0
+DA:741,0
+DA:753,384
+DA:754,384
+DA:755,384
+DA:781,576
+DA:782,576
+DA:785,192
+DA:786,192
+DA:787,192
+DA:788,192
+DA:789,192
+DA:792,384
+DA:794,384
+DA:795,384
+DA:796,384
+DA:797,384
+DA:798,384
+DA:799,384
+DA:800,384
+DA:801,384
+DA:802,384
+DA:803,384
+DA:804,384
+DA:823,0
+DA:824,0
+DA:827,0
+DA:828,0
+DA:829,0
+DA:830,0
+DA:831,0
+DA:834,0
+DA:836,0
+DA:837,0
+DA:838,0
+DA:839,0
+DA:840,0
+DA:841,0
+DA:842,0
+DA:843,0
+DA:844,0
+DA:845,0
+DA:846,0
+DA:847,0
+DA:860,0
+DA:861,0
+DA:864,0
+DA:865,0
+DA:866,0
+DA:867,0
+DA:868,0
+DA:872,0
+DA:873,0
+DA:874,0
+DA:875,0
+DA:876,0
+DA:877,0
+DA:879,0
+DA:880,0
+DA:898,0
+DA:899,0
+DA:900,0
+DA:901,0
+DA:902,0
+DA:928,1152
+DA:929,1152
+DA:930,1152
+DA:935,1152
+DA:936,1152
+DA:937,1152
+DA:938,1152
+DA:939,1152
+DA:940,1152
+DA:941,1152
+DA:942,1152
+DA:944,1152
+DA:945,1152
+DA:950,1152
+DA:951,1152
+DA:952,0
+DA:954,0
+DA:956,0
+DA:957,0
+DA:961,0
+DA:962,0
+DA:968,1152
+DA:969,768
+DA:970,1152
+DA:975,1152
+DA:976,1152
+DA:977,1152
+DA:978,2304
+DA:981,1920
+DA:982,1920
+DA:983,768
+DA:984,768
+DA:985,768
+DA:988,384
+DA:989,384
+DA:992,384
+DA:993,384
+DA:994,384
+DA:995,384
+DA:996,384
+DA:1002,1152
+DA:1003,1152
+DA:1004,1152
+DA:1006,1152
+DA:1009,768
+DA:1010,768
+DA:1011,384
+DA:1012,384
+DA:1015,384
+DA:1016,384
+DA:1021,1152
+DA:1022,1152
+DA:1023,1152
+DA:1024,384
+DA:1025,384
+DA:1027,1152
+DA:1028,1152
+DA:1029,1152
+DA:1030,1152
+DA:1031,1152
+DA:1049,1920
+DA:1054,1920
+DA:1055,1920
+DA:1056,1920
+DA:1058,1920
+DA:1059,1920
+DA:1060,1920
+DA:1061,1920
+DA:1062,1920
+DA:1063,1920
+DA:1079,0
+DA:1082,0
+DA:1083,0
+DA:1084,0
+DA:1085,0
+DA:1087,0
+DA:1088,0
+DA:1092,0
+DA:1093,0
+DA:1097,0
+DA:1098,0
+DA:1099,0
+DA:1112,1536
+DA:1113,1536
+DA:1114,1536
+DA:1130,0
+DA:1133,0
+DA:1134,0
+DA:1135,0
+DA:1137,0
+DA:1138,0
+DA:1139,0
+DA:1141,0
+DA:1142,0
+DA:1143,0
+DA:1145,0
+DA:1163,0
+DA:1168,0
+DA:1169,0
+DA:1170,0
+DA:1171,0
+DA:1172,0
+DA:1173,0
+DA:1174,0
+DA:1177,0
+DA:1178,0
+DA:1179,0
+DA:1180,0
+DA:1181,0
+DA:1184,0
+DA:1197,0
+DA:1198,0
+DA:1199,0
+DA:1214,10416
+DA:1215,10416
+DA:1216,10416
+DA:1217,10416
+DA:1232,5376
+DA:1233,5376
+DA:1234,5376
+DA:1235,5376
+DA:1250,7776
+DA:1251,7776
+DA:1252,7776
+DA:1253,7776
+DA:1273,0
+DA:1274,0
+DA:1275,0
+DA:1276,0
+DA:1277,0
+DA:1278,0
+DA:1291,0
+DA:1294,0
+DA:1295,0
+DA:1297,0
+DA:1298,0
+DA:1303,0
+DA:1304,0
+DA:1305,0
+DA:1307,0
+DA:1315,0
+DA:1316,0
+DA:1317,0
+DA:1318,0
+DA:1331,0
+DA:1334,0
+DA:1335,0
+DA:1336,0
+DA:1337,0
+DA:1350,0
+DA:1353,0
+DA:1354,0
+DA:1355,0
+DA:1356,0
+DA:1374,0
+DA:1379,0
+DA:1380,0
+DA:1381,0
+DA:1382,0
+DA:1383,0
+DA:1384,0
+DA:1385,0
+DA:1386,0
+DA:1387,0
+DA:1390,0
+DA:1402,3072
+DA:1404,3072
+DA:1406,3072
+DA:1408,384
+DA:1409,384
+DA:1410,384
+DA:1411,384
+DA:1414,0
+DA:1415,0
+DA:1417,0
+DA:1418,0
+DA:1422,0
+DA:1423,0
+DA:1424,0
+DA:1429,0
+DA:1434,384
+DA:1435,384
+DA:1436,384
+DA:1438,384
+DA:1439,384
+DA:1444,384
+DA:1445,384
+DA:1446,384
+DA:1447,384
+DA:1448,384
+DA:1449,384
+DA:1465,0
+DA:1468,0
+DA:1469,0
+DA:1471,0
+DA:1472,0
+DA:1476,0
+DA:1477,0
+DA:1478,0
+DA:1481,0
+DA:1482,0
+DA:1483,0
+DA:1485,0
+DA:1493,0
+DA:1509,0
+DA:1512,0
+DA:1513,0
+DA:1514,0
+DA:1515,0
+DA:1520,0
+DA:1521,0
+DA:1523,0
+DA:1536,0
+DA:1539,0
+DA:1540,0
+DA:1544,0
+DA:1545,0
+DA:1546,0
+DA:1547,0
+DA:1552,0
+DA:1553,0
+DA:1555,0
+DA:1568,0
+DA:1570,0
+DA:1572,0
+DA:1573,0
+DA:1574,0
+DA:1575,0
+DA:1576,0
+DA:1578,0
+DA:1591,0
+DA:1592,0
+DA:1593,0
+DA:1594,0
+DA:1597,0
+DA:1598,0
+DA:1599,0
+DA:1601,0
+DA:1615,0
+DA:1617,0
+DA:1618,0
+DA:1619,0
+DA:1621,0
+DA:1622,0
+DA:1623,0
+DA:1624,0
+DA:1626,0
+DA:1627,0
+DA:1628,0
+DA:1629,0
+DA:1633,0
+DA:1634,0
+DA:1635,0
+DA:1636,0
+DA:1637,0
+DA:1638,0
+DA:1639,0
+DA:1640,0
+DA:1641,0
+DA:1642,0
+DA:1643,0
+DA:1646,0
+DA:1647,0
+DA:1648,0
+DA:1650,0
+DA:1651,0
+DA:1664,0
+DA:1665,0
+DA:1666,0
+DA:1679,0
+DA:1680,0
+DA:1681,0
+DA:1694,0
+DA:1695,0
+DA:1696,0
+DA:1709,0
+DA:1710,0
+DA:1711,0
+DA:1724,1536
+DA:1725,1536
+DA:1726,1536
+DA:1738,1152
+DA:1750,384
+DA:1753,384
+DA:1754,384
+DA:1755,384
+DA:1756,384
+DA:1769,768
+DA:1772,768
+DA:1773,768
+DA:1774,768
+DA:1775,768
+DA:1786,0
+DA:1789,0
+DA:1791,0
+DA:1792,0
+DA:1793,0
+DA:1796,0
+DA:1798,0
+DA:1801,0
+DA:1804,0
+DA:1805,0
+DA:1809,0
+DA:1814,0
+DA:1815,0
+DA:1816,0
+DA:1817,0
+DA:1819,0
+DA:1821,0
+DA:1822,0
+DA:1823,0
+DA:1825,0
+DA:1827,0
+DA:1828,0
+DA:1829,0
+DA:1831,0
+DA:1832,0
+DA:1834,0
+DA:1836,0
+DA:1838,0
+DA:1839,0
+DA:1840,0
+DA:1841,0
+DA:1844,0
+DA:1847,0
+DA:1848,0
+DA:1854,0
+DA:1855,0
+DA:1856,0
+DA:1857,0
+DA:1858,0
+DA:1859,0
+DA:1860,0
+DA:1861,0
+DA:1862,0
+DA:1864,0
+DA:1872,0
+DA:1873,0
+DA:1874,0
+DA:1875,0
+DA:1876,0
+DA:1877,0
+DA:1879,0
+DA:1883,0
+DA:1886,0
+DA:1887,0
+DA:1893,0
+DA:1894,0
+DA:1895,0
+DA:1896,0
+DA:1897,0
+DA:1898,0
+DA:1899,0
+DA:1900,0
+DA:1901,0
+DA:1903,0
+DA:1906,0
+DA:1924,0
+DA:1929,0
+DA:1930,0
+DA:1931,0
+DA:1932,0
+DA:1933,0
+DA:1934,0
+DA:1935,0
+DA:1936,0
+DA:1952,0
+DA:1953,0
+DA:1955,0
+DA:1957,0
+DA:1960,0
+DA:1961,0
+DA:1962,0
+DA:1963,0
+DA:1972,0
+DA:1973,0
+DA:1974,0
+DA:1975,0
+DA:1976,0
+DA:1978,0
+DA:1979,0
+DA:1982,0
+DA:1983,0
+DA:1984,0
+DA:1985,0
+DA:1986,0
+DA:1987,0
+DA:1988,0
+DA:1995,0
+DA:1997,0
+DA:2005,0
+DA:2011,0
+DA:2012,0
+DA:2025,0
+DA:2026,0
+DA:2027,0
+DA:2028,0
+DA:2029,0
+DA:2031,0
+DA:2036,0
+DA:2037,0
+DA:2040,0
+DA:2041,0
+DA:2042,0
+DA:2043,0
+DA:2044,0
+DA:2045,0
+DA:2047,0
+DA:2048,0
+DA:2050,0
+DA:2066,0
+DA:2067,0
+DA:2084,0
+DA:2085,0
+DA:2099,0
+DA:2100,0
+DA:2116,0
+DA:2117,0
+DA:2134,0
+DA:2135,0
+DA:2149,0
+DA:2150,0
+DA:2166,0
+DA:2167,0
+DA:2184,0
+DA:2185,0
+DA:2199,0
+DA:2200,0
+DA:2220,384
+DA:2223,384
+DA:2225,384
+DA:2226,384
+DA:2228,0
+DA:2229,384
+DA:2231,0
+DA:2235,384
+DA:2238,384
+DA:2240,384
+DA:2262,384
+DA:2265,384
+DA:2267,384
+DA:2268,384
+DA:2270,0
+DA:2271,0
+DA:2276,0
+DA:2277,0
+DA:2278,0
+DA:2279,0
+DA:2282,384
+DA:2284,384
+DA:2286,384
+DA:2306,384
+DA:2309,384
+DA:2311,384
+DA:2312,384
+DA:2317,0
+DA:2318,0
+DA:2321,0
+DA:2325,0
+DA:2326,0
+DA:2329,0
+DA:2330,0
+DA:2331,0
+DA:2335,0
+DA:2341,384
+DA:2343,768
+DA:2346,384
+DA:2347,384
+DA:2348,384
+DA:2351,384
+DA:2353,384
+DA:2365,384
+DA:2368,384
+DA:2369,384
+DA:2370,384
+DA:2371,384
+DA:2375,0
+DA:2376,0
+DA:2377,0
+DA:2378,0
+DA:2379,0
+DA:2382,384
+DA:2394,768
+DA:2395,768
+DA:2396,384
+DA:2397,384
+DA:2400,384
+DA:2401,384
+DA:2404,1536
+DA:2405,1152
+DA:2406,768
+DA:2407,576
+DA:2409,768
+DA:2410,576
+DA:2412,768
+DA:2413,192
+DA:2415,768
+DA:2416,768
+DA:2419,1536
+DA:2420,1152
+DA:2421,384
+DA:2422,384
+DA:2423,192
+DA:2425,384
+DA:2426,0
+DA:2428,384
+DA:2429,384
+DA:2432,384
+DA:2433,384
+DA:2435,384
+DA:2436,384
+DA:2437,384
+DA:2439,384
+DA:2441,384
+DA:2442,0
+DA:2443,0
+DA:2446,384
+DA:2447,384
+DA:2448,384
+DA:2449,384
+DA:2451,384
+DA:2452,0
+DA:2453,0
+DA:2454,0
+DA:2457,384
+DA:2460,384
+DA:2462,384
+DA:2463,384
+DA:2464,384
+DA:2465,384
+LF:841
+LH:250
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-preconditioning.c
+FNL:0,1011,1215
+FNA:0,0,CeedOperatorMultigridLevelCreateSingle_Core
+FNL:1,105,202
+FNA:1,0,CeedOperatorCreateFallback
+FNL:2,1234,1255
+FNA:2,0,CeedBuildMassLaplace
+FNL:3,1276,1298
+FNA:3,0,CeedOperatorGetBasisPointer
+FNL:4,1310,1340
+FNA:4,0,CeedOperatorCreateActivePointBlockRestriction
+FNL:5,1352,1361
+FNA:5,0,CeedOperatorGetQFunctionAssemblyData
+FNL:6,1373,1378
+FNA:6,0,CeedQFunctionAssemblyDataCreate
+FNL:7,1389,1392
+FNA:7,0,CeedQFunctionAssemblyDataReference
+FNL:8,1404,1408
+FNA:8,0,CeedQFunctionAssemblyDataSetReuse
+FNL:9,1420,1423
+FNA:9,0,CeedQFunctionAssemblyDataSetUpdateNeeded
+FNL:10,1435,1438
+FNA:10,0,CeedQFunctionAssemblyDataIsUpdateNeeded
+FNL:11,1455,1460
+FNA:11,0,CeedQFunctionAssemblyDataReferenceCopy
+FNL:12,1472,1475
+FNA:12,0,CeedQFunctionAssemblyDataIsSetup
+FNL:13,1488,1494
+FNA:13,0,CeedQFunctionAssemblyDataSetObjects
+FNL:14,1507,1513
+FNA:14,0,CeedQFunctionAssemblyDataGetObjects
+FNL:15,1524,1535
+FNA:15,384,CeedQFunctionAssemblyDataDestroy
+FNL:16,1547,1556
+FNA:16,0,CeedOperatorGetOperatorAssemblyData
+FNL:17,1576,1721
+FNA:17,0,CeedOperatorAssemblyDataCreate
+FNL:18,1744,1758
+FNA:18,0,CeedOperatorAssemblyDataGetEvalModes
+FNL:19,1777,1885
+FNA:19,0,CeedOperatorAssemblyDataGetBases
+FNL:20,1902,1910
+FNA:20,0,CeedOperatorAssemblyDataGetElemRestrictions
+FNL:21,1921,1956
+FNA:21,384,CeedOperatorAssemblyDataDestroy
+FNL:22,1968,1995
+FNA:22,0,CeedOperatorGetFallback
+FNL:23,2007,2010
+FNA:23,0,CeedOperatorGetFallbackParent
+FNL:24,2022,2027
+FNA:24,0,CeedOperatorGetFallbackParentCeed
+FNL:25,2057,2073
+FNA:25,0,CeedOperatorLinearAssembleQFunction
+FNL:26,2094,2096
+FNA:26,0,CeedOperatorLinearAssembleQFunctionBuildOrUpdate
+FNL:27,2115,2162
+FNA:27,0,CeedOperatorLinearAssembleDiagonal
+FNL:28,216,387
+FNA:28,0,CeedOperatorLinearAssembleAddDiagonalSingle_Mesh
+FNL:29,2181,2221
+FNA:29,0,CeedOperatorLinearAssembleAddDiagonal
+FNL:30,2243,2325
+FNA:30,0,CeedOperatorLinearAssemblePointBlockDiagonalSymbolic
+FNL:31,2346,2388
+FNA:31,0,CeedOperatorLinearAssemblePointBlockDiagonal
+FNL:32,2409,2449
+FNA:32,0,CeedOperatorLinearAssembleAddPointBlockDiagonal
+FNL:33,2471,2516
+FNA:33,0,CeedOperatorLinearAssembleSymbolic
+FNL:34,2536,2588
+FNA:34,0,CeedOperatorLinearAssemble
+FNL:35,2604,2662
+FNA:35,0,CeedOperatorCompositeGetMultiplicity
+FNL:36,2681,2700
+FNA:36,0,CeedOperatorMultigridLevelCreate
+FNL:37,2720,2767
+FNA:37,0,CeedOperatorMultigridLevelCreateTensorH1
+FNL:38,2787,2832
+FNA:38,0,CeedOperatorMultigridLevelCreateH1
+FNL:39,2853,3076
+FNA:39,0,CeedOperatorCreateFDMElementInverse
+FNL:40,37,94
+FNA:40,0,CeedQFunctionCreateFallback
+FNL:41,401,409
+FNA:41,0,CeedOperatorLinearAssembleAddDiagonalSingle
+FNL:42,423,438
+FNA:42,0,CeedOperatorLinearAssembleAddDiagonalComposite
+FNL:43,454,552
+FNA:43,0,CeedOperatorAssembleSymbolicSingle
+FNL:44,574,632
+FNA:44,0,CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core
+FNL:45,653,656
+FNA:45,0,CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback
+FNL:46,671,916
+FNA:46,0,CeedOperatorAssembleSingle
+FNL:47,928,957
+FNA:47,0,CeedOperatorAssemblyCountEntriesSingle
+FNL:48,969,993
+FNA:48,0,CeedOperatorLinearAssembleGetNumEntries
+FNF:49
+FNH:2
+DA:37,0
+DA:38,0
+DA:43,0
+DA:45,0
+DA:47,0
+DA:48,0
+DA:50,0
+DA:51,0
+DA:52,0
+DA:53,0
+DA:54,0
+DA:55,0
+DA:57,0
+DA:64,0
+DA:65,0
+DA:66,0
+DA:71,0
+DA:72,0
+DA:73,0
+DA:75,0
+DA:76,0
+DA:81,0
+DA:82,0
+DA:84,0
+DA:89,0
+DA:90,0
+DA:92,0
+DA:93,0
+DA:105,0
+DA:111,0
+DA:114,0
+DA:115,0
+DA:116,0
+DA:117,0
+DA:119,0
+DA:122,0
+DA:123,0
+DA:127,0
+DA:128,0
+DA:129,0
+DA:130,0
+DA:133,0
+DA:134,0
+DA:137,0
+DA:139,0
+DA:142,0
+DA:143,0
+DA:144,0
+DA:145,0
+DA:146,0
+DA:150,0
+DA:151,0
+DA:152,0
+DA:153,0
+DA:154,0
+DA:156,0
+DA:158,0
+DA:159,0
+DA:165,0
+DA:166,0
+DA:167,0
+DA:168,0
+DA:169,0
+DA:171,0
+DA:177,0
+DA:178,0
+DA:179,0
+DA:180,0
+DA:181,0
+DA:186,0
+DA:187,0
+DA:190,0
+DA:191,0
+DA:192,0
+DA:194,0
+DA:195,0
+DA:198,0
+DA:199,0
+DA:200,0
+DA:201,0
+DA:216,0
+DA:220,0
+DA:221,0
+DA:226,0
+DA:227,0
+DA:229,0
+DA:230,0
+DA:231,0
+DA:232,0
+DA:242,0
+DA:243,0
+DA:246,0
+DA:247,0
+DA:250,0
+DA:252,0
+DA:253,0
+DA:257,0
+DA:258,0
+DA:259,0
+DA:260,0
+DA:261,0
+DA:264,0
+DA:265,0
+DA:268,0
+DA:269,0
+DA:270,0
+DA:271,0
+DA:274,0
+DA:275,0
+DA:278,0
+DA:282,0
+DA:283,0
+DA:285,0
+DA:289,0
+DA:292,0
+DA:293,0
+DA:294,0
+DA:295,0
+DA:296,0
+DA:297,0
+DA:298,0
+DA:301,0
+DA:302,0
+DA:304,0
+DA:305,0
+DA:307,0
+DA:308,0
+DA:309,0
+DA:314,0
+DA:316,0
+DA:317,0
+DA:319,0
+DA:320,0
+DA:321,0
+DA:322,0
+DA:324,0
+DA:325,0
+DA:326,0
+DA:327,0
+DA:328,0
+DA:330,0
+DA:332,0
+DA:333,0
+DA:335,0
+DA:336,0
+DA:337,0
+DA:338,0
+DA:339,0
+DA:341,0
+DA:344,0
+DA:346,0
+DA:347,0
+DA:349,0
+DA:350,0
+DA:351,0
+DA:352,0
+DA:354,0
+DA:355,0
+DA:356,0
+DA:361,0
+DA:362,0
+DA:363,0
+DA:365,0
+DA:366,0
+DA:374,0
+DA:377,0
+DA:380,0
+DA:381,0
+DA:382,0
+DA:384,0
+DA:385,0
+DA:386,0
+DA:401,0
+DA:405,0
+DA:406,0
+DA:407,0
+DA:408,0
+DA:423,0
+DA:428,0
+DA:429,0
+DA:430,0
+DA:431,0
+DA:432,0
+DA:434,0
+DA:437,0
+DA:454,0
+DA:457,0
+DA:465,0
+DA:466,0
+DA:467,0
+DA:469,0
+DA:470,0
+DA:471,0
+DA:472,0
+DA:473,0
+DA:474,0
+DA:477,0
+DA:478,0
+DA:479,0
+DA:480,0
+DA:481,0
+DA:482,0
+DA:483,0
+DA:484,0
+DA:485,0
+DA:486,0
+DA:487,0
+DA:489,0
+DA:490,0
+DA:491,0
+DA:495,0
+DA:496,0
+DA:497,0
+DA:500,0
+DA:501,0
+DA:502,0
+DA:503,0
+DA:504,0
+DA:505,0
+DA:506,0
+DA:507,0
+DA:508,0
+DA:509,0
+DA:510,0
+DA:512,0
+DA:513,0
+DA:514,0
+DA:515,0
+DA:516,0
+DA:517,0
+DA:518,0
+DA:520,0
+DA:523,0
+DA:524,0
+DA:525,0
+DA:526,0
+DA:527,0
+DA:528,0
+DA:529,0
+DA:530,0
+DA:531,0
+DA:533,0
+DA:534,0
+DA:535,0
+DA:541,0
+DA:542,0
+DA:543,0
+DA:544,0
+DA:545,0
+DA:546,0
+DA:548,0
+DA:549,0
+DA:550,0
+DA:551,0
+DA:574,0
+DA:576,0
+DA:577,0
+DA:578,0
+DA:580,0
+DA:583,0
+DA:584,0
+DA:586,0
+DA:587,0
+DA:588,0
+DA:589,0
+DA:591,0
+DA:592,0
+DA:596,0
+DA:600,0
+DA:601,0
+DA:603,0
+DA:604,0
+DA:605,0
+DA:608,0
+DA:609,0
+DA:610,0
+DA:612,0
+DA:613,0
+DA:615,0
+DA:618,0
+DA:619,0
+DA:620,0
+DA:621,0
+DA:626,0
+DA:627,0
+DA:628,0
+DA:629,0
+DA:631,0
+DA:653,0
+DA:655,0
+DA:671,0
+DA:674,0
+DA:675,0
+DA:679,0
+DA:681,0
+DA:682,0
+DA:685,0
+DA:687,0
+DA:688,0
+DA:693,0
+DA:694,0
+DA:695,0
+DA:696,0
+DA:697,0
+DA:701,0
+DA:702,0
+DA:708,0
+DA:709,0
+DA:711,0
+DA:712,0
+DA:713,0
+DA:714,0
+DA:719,0
+DA:726,0
+DA:727,0
+DA:730,0
+DA:731,0
+DA:734,0
+DA:736,0
+DA:739,0
+DA:740,0
+DA:741,0
+DA:742,0
+DA:743,0
+DA:744,0
+DA:746,0
+DA:747,0
+DA:748,0
+DA:749,0
+DA:750,0
+DA:752,0
+DA:753,0
+DA:754,0
+DA:755,0
+DA:756,0
+DA:759,0
+DA:760,0
+DA:761,0
+DA:765,0
+DA:766,0
+DA:767,0
+DA:768,0
+DA:769,0
+DA:774,0
+DA:775,0
+DA:776,0
+DA:777,0
+DA:778,0
+DA:781,0
+DA:782,0
+DA:783,0
+DA:784,0
+DA:786,0
+DA:787,0
+DA:789,0
+DA:794,0
+DA:796,0
+DA:797,0
+DA:798,0
+DA:799,0
+DA:801,0
+DA:802,0
+DA:803,0
+DA:804,0
+DA:806,0
+DA:807,0
+DA:808,0
+DA:809,0
+DA:810,0
+DA:812,0
+DA:813,0
+DA:814,0
+DA:815,0
+DA:817,0
+DA:819,0
+DA:825,0
+DA:826,0
+DA:831,0
+DA:832,0
+DA:833,0
+DA:837,0
+DA:838,0
+DA:840,0
+DA:841,0
+DA:843,0
+DA:844,0
+DA:847,0
+DA:848,0
+DA:851,0
+DA:852,0
+DA:853,0
+DA:854,0
+DA:855,0
+DA:856,0
+DA:860,0
+DA:861,0
+DA:863,0
+DA:864,0
+DA:865,0
+DA:868,0
+DA:869,0
+DA:872,0
+DA:873,0
+DA:874,0
+DA:875,0
+DA:876,0
+DA:877,0
+DA:883,0
+DA:884,0
+DA:885,0
+DA:886,0
+DA:892,0
+DA:893,0
+DA:896,0
+DA:897,0
+DA:898,0
+DA:899,0
+DA:900,0
+DA:901,0
+DA:902,0
+DA:904,0
+DA:905,0
+DA:906,0
+DA:907,0
+DA:908,0
+DA:911,0
+DA:912,0
+DA:913,0
+DA:914,0
+DA:915,0
+DA:928,0
+DA:933,0
+DA:934,0
+DA:936,0
+DA:937,0
+DA:938,0
+DA:939,0
+DA:940,0
+DA:941,0
+DA:942,0
+DA:946,0
+DA:947,0
+DA:949,0
+DA:950,0
+DA:951,0
+DA:953,0
+DA:954,0
+DA:955,0
+DA:956,0
+DA:969,0
+DA:972,0
+DA:973,0
+DA:975,0
+DA:979,0
+DA:980,0
+DA:982,0
+DA:983,0
+DA:986,0
+DA:987,0
+DA:990,0
+DA:992,0
+DA:1011,0
+DA:1017,0
+DA:1018,0
+DA:1021,0
+DA:1024,0
+DA:1025,0
+DA:1031,0
+DA:1032,0
+DA:1036,0
+DA:1037,0
+DA:1038,0
+DA:1039,0
+DA:1040,0
+DA:1042,0
+DA:1045,0
+DA:1047,0
+DA:1050,0
+DA:1051,0
+DA:1053,0
+DA:1054,0
+DA:1055,0
+DA:1056,0
+DA:1057,0
+DA:1058,0
+DA:1060,0
+DA:1061,0
+DA:1063,0
+DA:1064,0
+DA:1065,0
+DA:1066,0
+DA:1069,0
+DA:1072,0
+DA:1073,0
+DA:1075,0
+DA:1076,0
+DA:1077,0
+DA:1078,0
+DA:1079,0
+DA:1080,0
+DA:1082,0
+DA:1083,0
+DA:1085,0
+DA:1086,0
+DA:1087,0
+DA:1088,0
+DA:1094,0
+DA:1095,0
+DA:1099,0
+DA:1103,0
+DA:1104,0
+DA:1106,0
+DA:1107,0
+DA:1108,0
+DA:1109,0
+DA:1110,0
+DA:1111,0
+DA:1112,0
+DA:1113,0
+DA:1114,0
+DA:1118,0
+DA:1119,0
+DA:1120,0
+DA:1123,0
+DA:1127,0
+DA:1130,0
+DA:1135,0
+DA:1136,0
+DA:1137,0
+DA:1138,0
+DA:1139,0
+DA:1140,0
+DA:1141,0
+DA:1142,0
+DA:1143,0
+DA:1144,0
+DA:1145,0
+DA:1147,0
+DA:1148,0
+DA:1149,0
+DA:1150,0
+DA:1155,0
+DA:1156,0
+DA:1157,0
+DA:1158,0
+DA:1161,0
+DA:1164,0
+DA:1168,0
+DA:1173,0
+DA:1174,0
+DA:1175,0
+DA:1176,0
+DA:1177,0
+DA:1178,0
+DA:1179,0
+DA:1180,0
+DA:1181,0
+DA:1182,0
+DA:1183,0
+DA:1185,0
+DA:1186,0
+DA:1187,0
+DA:1188,0
+DA:1193,0
+DA:1194,0
+DA:1195,0
+DA:1196,0
+DA:1199,0
+DA:1202,0
+DA:1206,0
+DA:1209,0
+DA:1210,0
+DA:1211,0
+DA:1212,0
+DA:1213,0
+DA:1214,0
+DA:1234,0
+DA:1236,0
+DA:1237,0
+DA:1238,0
+DA:1239,0
+DA:1240,0
+DA:1244,0
+DA:1245,0
+DA:1246,0
+DA:1248,0
+DA:1249,0
+DA:1252,0
+DA:1253,0
+DA:1254,0
+DA:1276,0
+DA:1277,0
+DA:1278,0
+DA:1279,0
+DA:1280,0
+DA:1281,0
+DA:1282,0
+DA:1283,0
+DA:1284,0
+DA:1285,0
+DA:1286,0
+DA:1287,0
+DA:1288,0
+DA:1289,0
+DA:1290,0
+DA:1291,0
+DA:1292,0
+DA:1293,0
+DA:1294,0
+DA:1296,0
+DA:1297,0
+DA:1310,0
+DA:1316,0
+DA:1317,0
+DA:1320,0
+DA:1321,0
+DA:1322,0
+DA:1323,0
+DA:1324,0
+DA:1325,0
+DA:1326,0
+DA:1327,0
+DA:1328,0
+DA:1329,0
+DA:1333,0
+DA:1337,0
+DA:1338,0
+DA:1339,0
+DA:1352,0
+DA:1353,0
+DA:1356,0
+DA:1357,0
+DA:1359,0
+DA:1360,0
+DA:1373,0
+DA:1374,0
+DA:1375,0
+DA:1376,0
+DA:1377,0
+DA:1389,0
+DA:1390,0
+DA:1391,0
+DA:1404,0
+DA:1405,0
+DA:1406,0
+DA:1407,0
+DA:1420,0
+DA:1421,0
+DA:1422,0
+DA:1435,0
+DA:1436,0
+DA:1437,0
+DA:1455,0
+DA:1456,0
+DA:1457,0
+DA:1458,0
+DA:1459,0
+DA:1472,0
+DA:1473,0
+DA:1474,0
+DA:1488,0
+DA:1489,0
+DA:1490,0
+DA:1492,0
+DA:1493,0
+DA:1507,0
+DA:1508,0
+DA:1510,0
+DA:1511,0
+DA:1512,0
+DA:1524,384
+DA:1525,384
+DA:1526,384
+DA:1527,384
+DA:1529,0
+DA:1530,0
+DA:1531,0
+DA:1533,0
+DA:1534,0
+DA:1547,0
+DA:1548,0
+DA:1551,0
+DA:1552,0
+DA:1554,0
+DA:1555,0
+DA:1576,0
+DA:1577,0
+DA:1578,0
+DA:1579,0
+DA:1580,0
+DA:1586,0
+DA:1587,0
+DA:1590,0
+DA:1591,0
+DA:1594,0
+DA:1597,0
+DA:1598,0
+DA:1599,0
+DA:1602,0
+DA:1603,0
+DA:1604,0
+DA:1606,0
+DA:1608,0
+DA:1609,0
+DA:1610,0
+DA:1611,0
+DA:1612,0
+DA:1613,0
+DA:1615,0
+DA:1618,0
+DA:1619,0
+DA:1620,0
+DA:1621,0
+DA:1622,0
+DA:1623,0
+DA:1624,0
+DA:1625,0
+DA:1626,0
+DA:1627,0
+DA:1628,0
+DA:1629,0
+DA:1630,0
+DA:1631,0
+DA:1632,0
+DA:1633,0
+DA:1634,0
+DA:1635,0
+DA:1637,0
+DA:1639,0
+DA:1640,0
+DA:1641,0
+DA:1642,0
+DA:1643,0
+DA:1644,0
+DA:1646,0
+DA:1648,0
+DA:1650,0
+DA:1654,0
+DA:1655,0
+DA:1656,0
+DA:1657,0
+DA:1660,0
+DA:1661,0
+DA:1662,0
+DA:1664,0
+DA:1666,0
+DA:1667,0
+DA:1668,0
+DA:1669,0
+DA:1670,0
+DA:1671,0
+DA:1673,0
+DA:1676,0
+DA:1677,0
+DA:1678,0
+DA:1679,0
+DA:1680,0
+DA:1681,0
+DA:1682,0
+DA:1683,0
+DA:1684,0
+DA:1685,0
+DA:1686,0
+DA:1687,0
+DA:1688,0
+DA:1689,0
+DA:1690,0
+DA:1691,0
+DA:1692,0
+DA:1693,0
+DA:1695,0
+DA:1697,0
+DA:1698,0
+DA:1699,0
+DA:1700,0
+DA:1701,0
+DA:1702,0
+DA:1704,0
+DA:1706,0
+DA:1708,0
+DA:1710,0
+DA:1711,0
+DA:1712,0
+DA:1713,0
+DA:1714,0
+DA:1715,0
+DA:1716,0
+DA:1717,0
+DA:1718,0
+DA:1719,0
+DA:1720,0
+DA:1744,0
+DA:1748,0
+DA:1749,0
+DA:1750,0
+DA:1751,0
+DA:1752,0
+DA:1753,0
+DA:1754,0
+DA:1755,0
+DA:1756,0
+DA:1757,0
+DA:1777,0
+DA:1781,0
+DA:1784,0
+DA:1785,0
+DA:1786,0
+DA:1787,0
+DA:1789,0
+DA:1791,0
+DA:1792,0
+DA:1794,0
+DA:1795,0
+DA:1797,0
+DA:1798,0
+DA:1799,0
+DA:1800,0
+DA:1804,0
+DA:1805,0
+DA:1806,0
+DA:1807,0
+DA:1809,0
+DA:1810,0
+DA:1811,0
+DA:1813,0
+DA:1814,0
+DA:1815,0
+DA:1816,0
+DA:1817,0
+DA:1819,0
+DA:1820,0
+DA:1824,0
+DA:1825,0
+DA:1829,0
+DA:1832,0
+DA:1833,0
+DA:1834,0
+DA:1835,0
+DA:1837,0
+DA:1839,0
+DA:1840,0
+DA:1842,0
+DA:1843,0
+DA:1845,0
+DA:1846,0
+DA:1847,0
+DA:1848,0
+DA:1852,0
+DA:1853,0
+DA:1854,0
+DA:1855,0
+DA:1857,0
+DA:1858,0
+DA:1859,0
+DA:1861,0
+DA:1862,0
+DA:1863,0
+DA:1864,0
+DA:1865,0
+DA:1867,0
+DA:1868,0
+DA:1872,0
+DA:1873,0
+DA:1878,0
+DA:1879,0
+DA:1880,0
+DA:1881,0
+DA:1882,0
+DA:1883,0
+DA:1884,0
+DA:1902,0
+DA:1905,0
+DA:1906,0
+DA:1907,0
+DA:1908,0
+DA:1909,0
+DA:1921,384
+DA:1922,384
+DA:1923,384
+DA:1924,384
+DA:1926,0
+DA:1927,0
+DA:1928,0
+DA:1929,0
+DA:1930,0
+DA:1931,0
+DA:1932,0
+DA:1934,0
+DA:1935,0
+DA:1936,0
+DA:1937,0
+DA:1938,0
+DA:1939,0
+DA:1941,0
+DA:1942,0
+DA:1943,0
+DA:1944,0
+DA:1945,0
+DA:1946,0
+DA:1947,0
+DA:1948,0
+DA:1949,0
+DA:1950,0
+DA:1951,0
+DA:1952,0
+DA:1954,0
+DA:1955,0
+DA:1968,0
+DA:1970,0
+DA:1971,0
+DA:1975,0
+DA:1976,0
+DA:1977,0
+DA:1981,0
+DA:1982,0
+DA:1983,0
+DA:1984,0
+DA:1986,0
+DA:1987,0
+DA:1989,0
+DA:1991,0
+DA:1993,0
+DA:1994,0
+DA:2007,0
+DA:2008,0
+DA:2009,0
+DA:2022,0
+DA:2023,0
+DA:2024,0
+DA:2025,0
+DA:2026,0
+DA:2057,0
+DA:2058,0
+DA:2060,0
+DA:2062,0
+DA:2067,0
+DA:2068,0
+DA:2069,0
+DA:2070,0
+DA:2072,0
+DA:2094,0
+DA:2095,0
+DA:2115,0
+DA:2117,0
+DA:2119,0
+DA:2120,0
+DA:2122,0
+DA:2123,0
+DA:2126,0
+DA:2127,0
+DA:2129,0
+DA:2130,0
+DA:2133,0
+DA:2135,0
+DA:2136,0
+DA:2137,0
+DA:2139,0
+DA:2140,0
+DA:2141,0
+DA:2142,0
+DA:2144,0
+DA:2145,0
+DA:2146,0
+DA:2151,0
+DA:2152,0
+DA:2153,0
+DA:2154,0
+DA:2155,0
+DA:2159,0
+DA:2160,0
+DA:2161,0
+DA:2181,0
+DA:2183,0
+DA:2185,0
+DA:2186,0
+DA:2188,0
+DA:2189,0
+DA:2192,0
+DA:2193,0
+DA:2195,0
+DA:2196,0
+DA:2199,0
+DA:2201,0
+DA:2202,0
+DA:2203,0
+DA:2205,0
+DA:2206,0
+DA:2211,0
+DA:2212,0
+DA:2213,0
+DA:2214,0
+DA:2215,0
+DA:2219,0
+DA:2220,0
+DA:2243,0
+DA:2248,0
+DA:2250,0
+DA:2251,0
+DA:2252,0
+DA:2254,0
+DA:2255,0
+DA:2256,0
+DA:2258,0
+DA:2259,0
+DA:2269,0
+DA:2270,0
+DA:2271,0
+DA:2272,0
+DA:2275,0
+DA:2276,0
+DA:2277,0
+DA:2278,0
+DA:2281,0
+DA:2282,0
+DA:2284,0
+DA:2285,0
+DA:2292,0
+DA:2293,0
+DA:2294,0
+DA:2296,0
+DA:2301,0
+DA:2302,0
+DA:2303,0
+DA:2304,0
+DA:2305,0
+DA:2307,0
+DA:2308,0
+DA:2310,0
+DA:2311,0
+DA:2312,0
+DA:2313,0
+DA:2314,0
+DA:2319,0
+DA:2320,0
+DA:2321,0
+DA:2322,0
+DA:2324,0
+DA:2346,0
+DA:2348,0
+DA:2350,0
+DA:2351,0
+DA:2353,0
+DA:2354,0
+DA:2357,0
+DA:2358,0
+DA:2360,0
+DA:2361,0
+DA:2364,0
+DA:2366,0
+DA:2367,0
+DA:2368,0
+DA:2370,0
+DA:2371,0
+DA:2372,0
+DA:2377,0
+DA:2378,0
+DA:2379,0
+DA:2380,0
+DA:2381,0
+DA:2385,0
+DA:2386,0
+DA:2387,0
+DA:2409,0
+DA:2411,0
+DA:2413,0
+DA:2414,0
+DA:2416,0
+DA:2417,0
+DA:2420,0
+DA:2421,0
+DA:2423,0
+DA:2424,0
+DA:2427,0
+DA:2429,0
+DA:2430,0
+DA:2435,0
+DA:2436,0
+DA:2437,0
+DA:2438,0
+DA:2439,0
+DA:2443,0
+DA:2444,0
+DA:2446,0
+DA:2448,0
+DA:2471,0
+DA:2473,0
+DA:2477,0
+DA:2478,0
+DA:2480,0
+DA:2482,0
+DA:2483,0
+DA:2488,0
+DA:2489,0
+DA:2490,0
+DA:2491,0
+DA:2492,0
+DA:2499,0
+DA:2500,0
+DA:2501,0
+DA:2504,0
+DA:2505,0
+DA:2506,0
+DA:2507,0
+DA:2508,0
+DA:2509,0
+DA:2510,0
+DA:2513,0
+DA:2515,0
+DA:2536,0
+DA:2538,0
+DA:2539,0
+DA:2542,0
+DA:2543,0
+DA:2546,0
+DA:2547,0
+DA:2549,0
+DA:2550,0
+DA:2553,0
+DA:2555,0
+DA:2556,0
+DA:2557,0
+DA:2559,0
+DA:2560,0
+DA:2561,0
+DA:2562,0
+DA:2563,0
+DA:2564,0
+DA:2565,0
+DA:2567,0
+DA:2568,0
+DA:2569,0
+DA:2570,0
+DA:2571,0
+DA:2576,0
+DA:2577,0
+DA:2578,0
+DA:2579,0
+DA:2580,0
+DA:2585,0
+DA:2586,0
+DA:2587,0
+DA:2604,0
+DA:2613,0
+DA:2616,0
+DA:2619,0
+DA:2620,0
+DA:2621,0
+DA:2624,0
+DA:2625,0
+DA:2626,0
+DA:2627,0
+DA:2628,0
+DA:2629,0
+DA:2632,0
+DA:2637,0
+DA:2638,0
+DA:2642,0
+DA:2643,0
+DA:2644,0
+DA:2645,0
+DA:2646,0
+DA:2647,0
+DA:2648,0
+DA:2649,0
+DA:2651,0
+DA:2652,0
+DA:2654,0
+DA:2655,0
+DA:2656,0
+DA:2657,0
+DA:2659,0
+DA:2660,0
+DA:2661,0
+DA:2681,0
+DA:2683,0
+DA:2685,0
+DA:2688,0
+DA:2691,0
+DA:2692,0
+DA:2693,0
+DA:2697,0
+DA:2699,0
+DA:2720,0
+DA:2725,0
+DA:2727,0
+DA:2728,0
+DA:2731,0
+DA:2732,0
+DA:2733,0
+DA:2734,0
+DA:2740,0
+DA:2745,0
+DA:2747,0
+DA:2748,0
+DA:2749,0
+DA:2750,0
+DA:2751,0
+DA:2752,0
+DA:2753,0
+DA:2754,0
+DA:2755,0
+DA:2756,0
+DA:2757,0
+DA:2758,0
+DA:2759,0
+DA:2763,0
+DA:2765,0
+DA:2766,0
+DA:2787,0
+DA:2792,0
+DA:2794,0
+DA:2795,0
+DA:2798,0
+DA:2799,0
+DA:2800,0
+DA:2801,0
+DA:2804,0
+DA:2810,0
+DA:2812,0
+DA:2813,0
+DA:2814,0
+DA:2815,0
+DA:2816,0
+DA:2817,0
+DA:2818,0
+DA:2819,0
+DA:2820,0
+DA:2821,0
+DA:2822,0
+DA:2823,0
+DA:2824,0
+DA:2828,0
+DA:2830,0
+DA:2831,0
+DA:2853,0
+DA:2855,0
+DA:2856,0
+DA:2860,0
+DA:2861,0
+DA:2867,0
+DA:2869,0
+DA:2871,0
+DA:2872,0
+DA:2877,0
+DA:2878,0
+DA:2879,0
+DA:2880,0
+DA:2881,0
+DA:2886,0
+DA:2887,0
+DA:2888,0
+DA:2891,0
+DA:2892,0
+DA:2893,0
+DA:2896,0
+DA:2897,0
+DA:2900,0
+DA:2901,0
+DA:2902,0
+DA:2903,0
+DA:2904,0
+DA:2906,0
+DA:2908,0
+DA:2909,0
+DA:2910,0
+DA:2911,0
+DA:2912,0
+DA:2913,0
+DA:2914,0
+DA:2915,0
+DA:2918,0
+DA:2919,0
+DA:2920,0
+DA:2921,0
+DA:2922,0
+DA:2923,0
+DA:2924,0
+DA:2926,0
+DA:2927,0
+DA:2928,0
+DA:2929,0
+DA:2932,0
+DA:2933,0
+DA:2934,0
+DA:2935,0
+DA:2936,0
+DA:2938,0
+DA:2941,0
+DA:2942,0
+DA:2944,0
+DA:2945,0
+DA:2948,0
+DA:2949,0
+DA:2950,0
+DA:2951,0
+DA:2954,0
+DA:2955,0
+DA:2956,0
+DA:2957,0
+DA:2958,0
+DA:2959,0
+DA:2961,0
+DA:2962,0
+DA:2964,0
+DA:2965,0
+DA:2966,0
+DA:2967,0
+DA:2968,0
+DA:2972,0
+DA:2973,0
+DA:2975,0
+DA:2978,0
+DA:2979,0
+DA:2980,0
+DA:2981,0
+DA:2988,0
+DA:2989,0
+DA:2990,0
+DA:2991,0
+DA:2992,0
+DA:2993,0
+DA:2994,0
+DA:2995,0
+DA:2996,0
+DA:2999,0
+DA:3002,0
+DA:3003,0
+DA:3004,0
+DA:3005,0
+DA:3006,0
+DA:3007,0
+DA:3008,0
+DA:3012,0
+DA:3013,0
+DA:3014,0
+DA:3022,0
+DA:3023,0
+DA:3024,0
+DA:3025,0
+DA:3026,0
+DA:3027,0
+DA:3028,0
+DA:3029,0
+DA:3030,0
+DA:3035,0
+DA:3036,0
+DA:3041,0
+DA:3042,0
+DA:3043,0
+DA:3044,0
+DA:3045,0
+DA:3051,0
+DA:3052,0
+DA:3053,0
+DA:3054,0
+DA:3056,0
+DA:3057,0
+DA:3060,0
+DA:3061,0
+DA:3062,0
+DA:3063,0
+DA:3066,0
+DA:3067,0
+DA:3068,0
+DA:3069,0
+DA:3070,0
+DA:3071,0
+DA:3072,0
+DA:3073,0
+DA:3074,0
+DA:3075,0
+LF:1371
+LH:8
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-qfunction-register.c
+FNL:0,30,44
+FNA:0,192,CeedQFunctionRegisterAll
+FNF:1
+FNH:1
+DA:30,192
+DA:31,192
+DA:34,192
+DA:35,96
+DA:40,96
+DA:43,192
+LF:6
+LH:6
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-qfunction.c
+FNL:0,1023,1026
+FNA:0,0,CeedQFunctionSetContextWritable
+FNL:1,1036,1040
+FNA:1,192,CeedQFunctionSetUserFlopsEstimate
+FNL:2,1052,1055
+FNA:2,0,CeedQFunctionSetNumViewTabs
+FNL:3,106,112
+FNA:3,1152,CeedQFunctionFieldSet
+FNL:4,1067,1070
+FNA:4,0,CeedQFunctionGetNumViewTabs
+FNL:5,1082,1108
+FNA:5,0,CeedQFunctionView
+FNL:6,1120,1123
+FNA:6,480,CeedQFunctionGetCeed
+FNL:7,1134,1134
+FNA:7,0,CeedQFunctionReturnCeed
+FNL:8,1150,1160
+FNA:8,3072,CeedQFunctionApply
+FNL:9,1171,1202
+FNA:9,10368,CeedQFunctionDestroy
+FNL:10,127,143
+FNA:10,0,CeedQFunctionFieldView
+FNL:11,155,158
+FNA:11,0,CeedQFunctionView_Object
+FNL:12,169,172
+FNA:12,0,CeedQFunctionDestroy_Object
+FNL:13,184,187
+FNA:13,96,CeedQFunctionSetFortranStatus
+FNL:14,207,210
+FNA:14,3072,CeedQFunctionGetVectorLength
+FNL:15,223,227
+FNA:15,3072,CeedQFunctionGetNumArgs
+FNL:16,240,247
+FNA:16,0,CeedQFunctionGetName
+FNL:17,259,277
+FNA:17,768,CeedQFunctionGetKernelName
+FNL:18,289,318
+FNA:18,768,CeedQFunctionGetSourcePath
+FNL:19,338,353
+FNA:19,0,CeedQFunctionLoadSourceToBuffer
+FNL:20,365,368
+FNA:20,3072,CeedQFunctionGetUserFunction
+FNL:21,382,386
+FNA:21,6192,CeedQFunctionGetContext
+FNL:22,400,417
+FNA:22,3072,CeedQFunctionGetContextData
+FNL:23,429,444
+FNA:23,3072,CeedQFunctionRestoreContextData
+FNL:24,457,472
+FNA:24,0,CeedQFunctionGetInnerContext
+FNL:25,486,502
+FNA:25,0,CeedQFunctionGetInnerContextData
+FNL:26,514,529
+FNA:26,0,CeedQFunctionRestoreInnerContextData
+FNL:27,541,544
+FNA:27,384,CeedQFunctionIsIdentity
+FNL:28,556,559
+FNA:28,2304,CeedQFunctionIsContextWritable
+FNL:29,571,574
+FNA:29,3456,CeedQFunctionGetData
+FNL:30,586,589
+FNA:30,384,CeedQFunctionSetData
+FNL:31,601,604
+FNA:31,1152,CeedQFunctionIsImmutable
+FNL:32,615,618
+FNA:32,9216,CeedQFunctionSetImmutable
+FNL:33,629,632
+FNA:33,4608,CeedQFunctionReference
+FNL:34,64,87
+FNA:34,1536,CeedQFunctionRegister
+FNL:35,642,645
+FNA:35,0,CeedQFunctionGetFlopsEstimate
+FNL:36,677,711
+FNA:36,720,CeedQFunctionCreateInterior
+FNL:37,724,753
+FNA:37,192,CeedQFunctionCreateInteriorByName
+FNL:38,772,787
+FNA:38,0,CeedQFunctionCreateIdentity
+FNL:39,804,809
+FNA:39,4608,CeedQFunctionReferenceCopy
+FNL:40,834,851
+FNA:40,768,CeedQFunctionAddInput
+FNL:41,876,894
+FNA:41,384,CeedQFunctionAddOutput
+FNL:42,911,919
+FNA:42,5760,CeedQFunctionGetFields
+FNL:43,931,934
+FNA:43,4224,CeedQFunctionFieldGetName
+FNL:44,946,949
+FNA:44,9792,CeedQFunctionFieldGetSize
+FNL:45,961,964
+FNA:45,14784,CeedQFunctionFieldGetEvalMode
+FNL:46,980,985
+FNA:46,1152,CeedQFunctionFieldGetData
+FNL:47,997,1002
+FNA:47,144,CeedQFunctionSetContext
+FNF:48
+FNH:33
+DA:64,1536
+DA:67,1536
+DA:69,1536
+DA:70,1536
+DA:72,1536
+DA:73,1536
+DA:74,1536
+DA:75,1536
+DA:76,1536
+DA:77,1536
+DA:78,1536
+DA:79,1536
+DA:80,1536
+DA:82,0
+DA:85,1536
+DA:86,1536
+DA:106,1152
+DA:107,1152
+DA:108,1152
+DA:109,1152
+DA:110,1152
+DA:111,1152
+DA:127,0
+DA:128,0
+DA:133,0
+DA:134,0
+DA:141,0
+DA:142,0
+DA:155,0
+DA:156,0
+DA:157,0
+DA:169,0
+DA:170,0
+DA:171,0
+DA:184,96
+DA:185,96
+DA:186,96
+DA:207,3072
+DA:208,3072
+DA:209,3072
+DA:223,3072
+DA:224,3072
+DA:225,3072
+DA:226,3072
+DA:240,0
+DA:241,0
+DA:242,0
+DA:244,0
+DA:246,0
+DA:259,768
+DA:260,768
+DA:263,96
+DA:264,96
+DA:265,96
+DA:267,96
+DA:268,96
+DA:270,0
+DA:272,96
+DA:275,768
+DA:276,768
+DA:289,768
+DA:290,768
+DA:295,96
+DA:296,96
+DA:298,96
+DA:299,96
+DA:300,96
+DA:301,48
+DA:303,48
+DA:305,96
+DA:307,96
+DA:309,96
+DA:310,96
+DA:311,96
+DA:313,96
+DA:316,768
+DA:317,768
+DA:338,0
+DA:341,0
+DA:342,0
+DA:343,0
+DA:345,0
+DA:347,0
+DA:348,0
+DA:349,0
+DA:350,0
+DA:352,0
+DA:365,3072
+DA:366,3072
+DA:367,3072
+DA:382,6192
+DA:383,6192
+DA:384,6192
+DA:385,6192
+DA:400,3072
+DA:404,3072
+DA:405,3072
+DA:406,1152
+DA:407,1152
+DA:408,1152
+DA:410,0
+DA:413,1920
+DA:415,3072
+DA:416,3072
+DA:429,3072
+DA:433,3072
+DA:434,3072
+DA:435,1152
+DA:436,1152
+DA:437,1152
+DA:439,0
+DA:442,3072
+DA:443,3072
+DA:457,0
+DA:460,0
+DA:461,0
+DA:462,0
+DA:464,0
+DA:465,0
+DA:466,0
+DA:468,0
+DA:470,0
+DA:471,0
+DA:486,0
+DA:490,0
+DA:491,0
+DA:492,0
+DA:493,0
+DA:494,0
+DA:496,0
+DA:499,0
+DA:501,0
+DA:514,0
+DA:518,0
+DA:519,0
+DA:520,0
+DA:521,0
+DA:522,0
+DA:524,0
+DA:527,0
+DA:528,0
+DA:541,384
+DA:542,384
+DA:543,384
+DA:556,2304
+DA:557,2304
+DA:558,2304
+DA:571,3456
+DA:572,3456
+DA:573,3456
+DA:586,384
+DA:587,384
+DA:588,384
+DA:601,1152
+DA:602,1152
+DA:603,1152
+DA:615,9216
+DA:616,9216
+DA:617,9216
+DA:629,4608
+DA:630,4608
+DA:631,4608
+DA:642,0
+DA:643,0
+DA:644,0
+DA:677,720
+DA:680,720
+DA:683,336
+DA:684,336
+DA:685,336
+DA:686,336
+DA:687,336
+DA:690,384
+DA:693,384
+DA:694,384
+DA:695,384
+DA:696,384
+DA:697,384
+DA:698,384
+DA:699,384
+DA:700,384
+DA:701,384
+DA:703,384
+DA:704,384
+DA:705,384
+DA:707,384
+DA:708,384
+DA:709,384
+DA:710,384
+DA:724,192
+DA:725,192
+DA:727,192
+DA:729,192
+DA:730,3264
+DA:732,3072
+DA:733,7296
+DA:735,3072
+DA:736,352
+DA:737,352
+DA:740,192
+DA:743,192
+DA:747,192
+DA:750,192
+DA:751,192
+DA:752,192
+DA:772,0
+DA:776,0
+DA:777,0
+DA:778,0
+DA:780,0
+DA:782,0
+DA:783,0
+DA:784,0
+DA:785,0
+DA:786,0
+DA:804,4608
+DA:805,4608
+DA:806,4608
+DA:807,4608
+DA:808,4608
+DA:834,768
+DA:837,768
+DA:838,768
+DA:839,768
+DA:840,1152
+DA:841,384
+DA:844,768
+DA:845,0
+DA:848,768
+DA:849,768
+DA:850,768
+DA:876,384
+DA:879,384
+DA:880,384
+DA:881,384
+DA:883,1152
+DA:884,768
+DA:887,384
+DA:888,0
+DA:891,384
+DA:892,384
+DA:893,384
+DA:911,5760
+DA:913,5760
+DA:914,5760
+DA:915,5760
+DA:916,5760
+DA:917,5760
+DA:918,5760
+DA:931,4224
+DA:932,4224
+DA:933,4224
+DA:946,9792
+DA:947,9792
+DA:948,9792
+DA:961,14784
+DA:962,14784
+DA:963,14784
+DA:980,1152
+DA:981,1152
+DA:982,1152
+DA:983,1152
+DA:984,1152
+DA:997,144
+DA:998,144
+DA:999,144
+DA:1000,144
+DA:1001,144
+DA:1023,0
+DA:1024,0
+DA:1025,0
+DA:1036,192
+DA:1037,192
+DA:1038,192
+DA:1039,192
+DA:1052,0
+DA:1053,0
+DA:1054,0
+DA:1067,0
+DA:1068,0
+DA:1069,0
+DA:1082,0
+DA:1083,0
+DA:1087,0
+DA:1089,0
+DA:1090,0
+DA:1091,0
+DA:1094,0
+DA:1095,0
+DA:1097,0
+DA:1098,0
+DA:1099,0
+DA:1102,0
+DA:1103,0
+DA:1104,0
+DA:1106,0
+DA:1107,0
+DA:1120,480
+DA:1121,480
+DA:1122,480
+DA:1134,0
+DA:1150,3072
+DA:1153,3072
+DA:1154,3072
+DA:1155,3072
+DA:1157,3072
+DA:1158,3072
+DA:1159,3072
+DA:1171,10368
+DA:1172,10368
+DA:1173,9984
+DA:1174,9984
+DA:1177,384
+DA:1178,384
+DA:1181,1152
+DA:1182,768
+DA:1183,768
+DA:1185,768
+DA:1186,384
+DA:1187,384
+DA:1189,384
+DA:1190,384
+DA:1193,384
+DA:1195,384
+DA:1196,384
+DA:1197,384
+DA:1198,384
+DA:1199,384
+DA:1200,384
+DA:1201,384
+LF:330
+LH:230
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-qfunctioncontext.c
+FNL:0,111,128
+FNA:0,576,CeedQFunctionContextDestroyData
+FNL:1,140,143
+FNA:1,0,CeedQFunctionContextView_Object
+FNL:2,154,157
+FNA:2,0,CeedQFunctionContextDestroy_Object
+FNL:3,177,180
+FNA:3,288,CeedQFunctionContextGetCeed
+FNL:4,191,191
+FNA:4,0,CeedQFunctionContextReturnCeed
+FNL:5,203,208
+FNA:5,1584,CeedQFunctionContextHasValidData
+FNL:6,221,226
+FNA:6,0,CeedQFunctionContextHasBorrowedDataOfType
+FNL:7,238,241
+FNA:7,0,CeedQFunctionContextGetState
+FNL:8,253,256
+FNA:8,4680,CeedQFunctionContextGetBackendData
+FNL:9,268,271
+FNA:9,288,CeedQFunctionContextSetBackendData
+FNL:10,284,295
+FNA:10,0,CeedQFunctionContextGetFieldLabel
+FNL:11,309,327
+FNA:11,0,CeedQFunctionContextSetGeneric
+FNL:12,342,365
+FNA:12,0,CeedQFunctionContextGetGenericRead
+FNL:13,36,42
+FNA:13,0,CeedQFunctionContextGetFieldIndex
+FNL:14,379,388
+FNA:14,0,CeedQFunctionContextRestoreGenericRead
+FNL:15,401,405
+FNA:15,0,CeedQFunctionContextSetDouble
+FNL:16,419,423
+FNA:16,0,CeedQFunctionContextGetDoubleRead
+FNL:17,436,440
+FNA:17,0,CeedQFunctionContextRestoreDoubleRead
+FNL:18,453,457
+FNA:18,0,CeedQFunctionContextSetInt32
+FNL:19,471,475
+FNA:19,0,CeedQFunctionContextGetInt32Read
+FNL:20,488,492
+FNA:20,0,CeedQFunctionContextRestoreInt32Read
+FNL:21,505,509
+FNA:21,0,CeedQFunctionContextSetBoolean
+FNL:22,523,527
+FNA:22,0,CeedQFunctionContextGetBooleanRead
+FNL:23,540,544
+FNA:23,0,CeedQFunctionContextRestoreBooleanRead
+FNL:24,557,561
+FNA:24,576,CeedQFunctionContextGetDataDestroy
+FNL:25,572,575
+FNA:25,2496,CeedQFunctionContextReference
+FNL:26,58,100
+FNA:26,0,CeedQFunctionContextRegisterGeneric
+FNL:27,595,610
+FNA:27,540,CeedQFunctionContextCreate
+FNL:28,627,632
+FNA:28,2352,CeedQFunctionContextReferenceCopy
+FNL:29,650,660
+FNA:29,288,CeedQFunctionContextSetData
+FNL:30,676,694
+FNA:30,0,CeedQFunctionContextTakeData
+FNL:31,713,728
+FNA:31,1584,CeedQFunctionContextGetData
+FNL:32,747,761
+FNA:32,0,CeedQFunctionContextGetDataRead
+FNL:33,773,780
+FNA:33,1584,CeedQFunctionContextRestoreData
+FNL:34,792,799
+FNA:34,0,CeedQFunctionContextRestoreDataRead
+FNL:35,814,817
+FNA:35,0,CeedQFunctionContextRegisterDouble
+FNL:36,832,835
+FNA:36,0,CeedQFunctionContextRegisterInt32
+FNL:37,850,853
+FNA:37,0,CeedQFunctionContextRegisterBoolean
+FNL:38,866,870
+FNA:38,0,CeedQFunctionContextGetAllFieldLabels
+FNL:39,886,894
+FNA:39,0,CeedContextFieldLabelGetDescription
+FNL:40,906,909
+FNA:40,1476,CeedQFunctionContextGetContextSize
+FNL:41,921,924
+FNA:41,0,CeedQFunctionContextSetNumViewTabs
+FNL:42,936,939
+FNA:42,0,CeedQFunctionContextGetNumViewTabs
+FNL:43,951,969
+FNA:43,0,CeedQFunctionContextView
+FNL:44,982,987
+FNA:44,0,CeedQFunctionContextSetDataDestroy
+FNL:45,998,1016
+FNA:45,9360,CeedQFunctionContextDestroy
+FNF:46
+FNH:14
+DA:36,0
+DA:37,0
+DA:38,0
+DA:39,0
+DA:41,0
+DA:58,0
+DA:60,0
+DA:61,0
+DA:64,0
+DA:65,0
+DA:69,0
+DA:70,0
+DA:71,0
+DA:72,0
+DA:73,0
+DA:74,0
+DA:76,0
+DA:79,0
+DA:80,0
+DA:81,0
+DA:82,0
+DA:83,0
+DA:84,0
+DA:85,0
+DA:86,0
+DA:87,0
+DA:88,0
+DA:92,0
+DA:93,0
+DA:94,0
+DA:95,0
+DA:96,0
+DA:97,0
+DA:98,0
+DA:99,0
+DA:111,576
+DA:112,576
+DA:113,144
+DA:118,432
+DA:119,432
+DA:122,0
+DA:123,0
+DA:124,0
+DA:127,576
+DA:140,0
+DA:141,0
+DA:142,0
+DA:154,0
+DA:155,0
+DA:156,0
+DA:177,288
+DA:178,288
+DA:179,288
+DA:191,0
+DA:203,1584
+DA:204,1584
+DA:206,1584
+DA:207,1584
+DA:221,0
+DA:222,0
+DA:224,0
+DA:225,0
+DA:238,0
+DA:239,0
+DA:240,0
+DA:253,4680
+DA:254,4680
+DA:255,4680
+DA:268,288
+DA:269,288
+DA:270,288
+DA:284,0
+DA:287,0
+DA:289,0
+DA:290,0
+DA:292,0
+DA:294,0
+DA:309,0
+DA:314,0
+DA:318,0
+DA:319,0
+DA:320,0
+DA:321,0
+DA:322,0
+DA:323,0
+DA:324,0
+DA:326,0
+DA:342,0
+DA:347,0
+DA:351,0
+DA:352,0
+DA:353,0
+DA:354,0
+DA:355,0
+DA:356,0
+DA:357,0
+DA:358,0
+DA:359,0
+DA:360,0
+DA:361,0
+DA:362,0
+DA:364,0
+DA:379,0
+DA:382,0
+DA:386,0
+DA:387,0
+DA:401,0
+DA:402,0
+DA:403,0
+DA:404,0
+DA:419,0
+DA:420,0
+DA:421,0
+DA:422,0
+DA:436,0
+DA:437,0
+DA:438,0
+DA:439,0
+DA:453,0
+DA:454,0
+DA:455,0
+DA:456,0
+DA:471,0
+DA:472,0
+DA:473,0
+DA:474,0
+DA:488,0
+DA:489,0
+DA:490,0
+DA:491,0
+DA:505,0
+DA:506,0
+DA:507,0
+DA:508,0
+DA:523,0
+DA:524,0
+DA:525,0
+DA:526,0
+DA:540,0
+DA:541,0
+DA:542,0
+DA:543,0
+DA:557,576
+DA:558,576
+DA:559,576
+DA:560,576
+DA:572,2496
+DA:573,2496
+DA:574,2496
+DA:595,540
+DA:596,540
+DA:599,252
+DA:600,252
+DA:601,252
+DA:602,252
+DA:603,252
+DA:606,288
+DA:607,288
+DA:608,288
+DA:609,288
+DA:627,2352
+DA:628,2352
+DA:629,2352
+DA:630,2352
+DA:631,2352
+DA:650,288
+DA:651,288
+DA:652,288
+DA:655,288
+DA:656,288
+DA:657,288
+DA:658,288
+DA:659,288
+DA:676,0
+DA:677,0
+DA:678,0
+DA:680,0
+DA:681,0
+DA:683,0
+DA:684,0
+DA:687,0
+DA:688,0
+DA:691,0
+DA:692,0
+DA:693,0
+DA:713,1584
+DA:714,1584
+DA:716,1584
+DA:717,1584
+DA:719,1584
+DA:722,1584
+DA:723,1584
+DA:725,1584
+DA:726,1584
+DA:727,1584
+DA:747,0
+DA:748,0
+DA:750,0
+DA:752,0
+DA:755,0
+DA:756,0
+DA:758,0
+DA:759,0
+DA:760,0
+DA:773,1584
+DA:774,1584
+DA:776,1584
+DA:777,1584
+DA:778,1584
+DA:779,1584
+DA:792,0
+DA:793,0
+DA:795,0
+DA:796,0
+DA:797,0
+DA:798,0
+DA:814,0
+DA:816,0
+DA:832,0
+DA:834,0
+DA:850,0
+DA:852,0
+DA:866,0
+DA:867,0
+DA:868,0
+DA:869,0
+DA:886,0
+DA:888,0
+DA:889,0
+DA:890,0
+DA:891,0
+DA:892,0
+DA:893,0
+DA:906,1476
+DA:907,1476
+DA:908,1476
+DA:921,0
+DA:922,0
+DA:923,0
+DA:936,0
+DA:937,0
+DA:938,0
+DA:951,0
+DA:952,0
+DA:955,0
+DA:957,0
+DA:958,0
+DA:959,0
+DA:962,0
+DA:963,0
+DA:964,0
+DA:965,0
+DA:967,0
+DA:968,0
+DA:982,0
+DA:983,0
+DA:984,0
+DA:985,0
+DA:986,0
+DA:998,9360
+DA:999,9360
+DA:1000,9072
+DA:1001,9072
+DA:1003,288
+DA:1005,288
+DA:1006,288
+DA:1007,288
+DA:1008,0
+DA:1009,0
+DA:1010,0
+DA:1012,288
+DA:1013,288
+DA:1014,288
+DA:1015,288
+LF:274
+LH:81
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-register.c
+FNL:0,30,44
+FNA:0,432,CeedRegisterAll
+FNF:1
+FNH:1
+DA:30,432
+DA:31,432
+DA:34,432
+DA:35,192
+DA:40,192
+DA:43,432
+LF:6
+LH:6
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-tensor.c
+FNL:0,124,136
+FNA:0,0,CeedTensorContractStridedApply
+FNL:1,148,151
+FNA:1,192,CeedTensorContractGetCeed
+FNL:2,162,162
+FNA:2,192,CeedTensorContractReturnCeed
+FNL:3,174,177
+FNA:3,0,CeedTensorContractGetData
+FNL:4,189,192
+FNA:4,0,CeedTensorContractSetData
+FNL:5,203,206
+FNA:5,384,CeedTensorContractReference
+FNL:6,223,228
+FNA:6,0,CeedTensorContractReferenceCopy
+FNL:7,239,250
+FNA:7,768,CeedTensorContractDestroy
+FNL:8,31,34
+FNA:8,0,CeedTensorContractDestroy_Object
+FNL:9,54,69
+FNA:9,576,CeedTensorContractCreate
+FNL:10,94,98
+FNA:10,8064,CeedTensorContractApply
+FNF:11
+FNH:6
+DA:31,0
+DA:32,0
+DA:33,0
+DA:54,576
+DA:55,576
+DA:58,192
+DA:59,192
+DA:60,192
+DA:61,192
+DA:62,192
+DA:65,384
+DA:66,384
+DA:67,384
+DA:68,384
+DA:94,8064
+DA:96,8064
+DA:97,8064
+DA:124,0
+DA:126,0
+DA:127,0
+DA:128,0
+DA:131,0
+DA:132,0
+DA:135,0
+DA:148,192
+DA:149,192
+DA:150,192
+DA:162,192
+DA:174,0
+DA:175,0
+DA:176,0
+DA:189,0
+DA:190,0
+DA:191,0
+DA:203,384
+DA:204,384
+DA:205,384
+DA:223,0
+DA:224,0
+DA:225,0
+DA:226,0
+DA:227,0
+DA:239,768
+DA:240,768
+DA:241,384
+DA:242,384
+DA:244,384
+DA:245,192
+DA:247,384
+DA:248,384
+DA:249,384
+LF:51
+LH:30
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed-vector.c
+FNL:0,1002,1031
+FNA:0,0,CeedVectorReciprocal
+FNL:1,1043,1046
+FNA:1,0,CeedVectorSetNumViewTabs
+FNL:2,1058,1061
+FNA:2,0,CeedVectorGetNumViewTabs
+FNL:3,1080,1111
+FNA:3,0,CeedVectorViewRange
+FNL:4,1124,1130
+FNA:4,0,CeedVectorView
+FNL:5,113,118
+FNA:5,0,CeedVectorHasBorrowedArrayOfType
+FNL:6,1142,1145
+FNA:6,3648,CeedVectorGetCeed
+FNL:7,1156,1156
+FNA:7,0,CeedVectorReturnCeed
+FNL:8,1168,1171
+FNA:8,123936,CeedVectorGetLength
+FNL:9,1182,1194
+FNA:9,20352,CeedVectorDestroy
+FNL:10,130,133
+FNA:10,384,CeedVectorGetState
+FNL:11,145,148
+FNA:11,74688,CeedVectorGetData
+FNL:12,160,163
+FNA:12,3648,CeedVectorSetData
+FNL:13,174,177
+FNA:13,1632,CeedVectorReference
+FNL:14,198,216
+FNA:14,6120,CeedVectorCreate
+FNL:15,233,238
+FNA:15,8928,CeedVectorReferenceCopy
+FNL:16,250,285
+FNA:16,0,CeedVectorCopy
+FNL:17,300,336
+FNA:17,0,CeedVectorCopyStrided
+FNL:18,353,365
+FNA:18,6744,CeedVectorSetArray
+FNL:19,377,395
+FNA:19,1056,CeedVectorSetValue
+FNL:20,412,435
+FNA:20,0,CeedVectorSetValueStrided
+FNL:21,450,468
+FNA:21,0,CeedVectorSyncArray
+FNL:22,485,508
+FNA:22,0,CeedVectorTakeArray
+FNL:23,52,55
+FNA:23,0,CeedVectorView_Object
+FNL:24,527,549
+FNA:24,1824,CeedVectorGetArray
+FNL:25,565,586
+FNA:25,14976,CeedVectorGetArrayRead
+FNL:26,602,618
+FNA:26,11088,CeedVectorGetArrayWrite
+FNL:27,630,639
+FNA:27,12912,CeedVectorRestoreArray
+FNL:28,651,661
+FNA:28,14976,CeedVectorRestoreArrayRead
+FNL:29,66,69
+FNA:29,0,CeedVectorDestroy_Object
+FNL:30,677,723
+FNA:30,0,CeedVectorNorm
+FNL:31,735,757
+FNA:31,0,CeedVectorScale
+FNL:32,770,827
+FNA:32,0,CeedVectorAXPY
+FNL:33,841,898
+FNA:33,0,CeedVectorAXPBY
+FNL:34,89,100
+FNA:34,16800,CeedVectorHasValidArray
+FNL:35,913,991
+FNA:35,0,CeedVectorPointwiseMult
+FNF:36
+FNH:17
+DA:52,0
+DA:53,0
+DA:54,0
+DA:66,0
+DA:67,0
+DA:68,0
+DA:89,16800
+DA:92,16800
+DA:93,16800
+DA:94,16800
+DA:95,0
+DA:96,0
+DA:98,16800
+DA:99,16800
+DA:113,0
+DA:114,0
+DA:116,0
+DA:117,0
+DA:130,384
+DA:131,384
+DA:132,384
+DA:145,74688
+DA:146,74688
+DA:147,74688
+DA:160,3648
+DA:161,3648
+DA:162,3648
+DA:174,1632
+DA:175,1632
+DA:176,1632
+DA:198,6120
+DA:199,6120
+DA:200,6120
+DA:203,2472
+DA:204,2472
+DA:205,2472
+DA:206,2472
+DA:207,2472
+DA:210,3648
+DA:211,3648
+DA:212,3648
+DA:213,3648
+DA:214,3648
+DA:215,3648
+DA:233,8928
+DA:234,8928
+DA:235,8928
+DA:236,8928
+DA:237,8928
+DA:250,0
+DA:258,0
+DA:259,0
+DA:260,0
+DA:262,0
+DA:263,0
+DA:264,0
+DA:268,0
+DA:274,0
+DA:275,0
+DA:276,0
+DA:280,0
+DA:281,0
+DA:283,0
+DA:284,0
+DA:300,0
+DA:302,0
+DA:303,0
+DA:309,0
+DA:310,0
+DA:311,0
+DA:312,0
+DA:314,0
+DA:316,0
+DA:320,0
+DA:321,0
+DA:322,0
+DA:323,0
+DA:327,0
+DA:328,0
+DA:329,0
+DA:330,0
+DA:333,0
+DA:334,0
+DA:335,0
+DA:353,6744
+DA:356,6744
+DA:357,6744
+DA:359,6744
+DA:361,6744
+DA:362,6744
+DA:363,6744
+DA:364,6744
+DA:377,1056
+DA:378,1056
+DA:380,1056
+DA:382,1056
+DA:383,144
+DA:384,144
+DA:389,912
+DA:390,912
+DA:391,274224
+DA:392,912
+DA:394,1056
+DA:412,0
+DA:415,0
+DA:417,0
+DA:418,0
+DA:419,0
+DA:422,0
+DA:423,0
+DA:424,0
+DA:428,0
+DA:429,0
+DA:430,0
+DA:431,0
+DA:432,0
+DA:434,0
+DA:450,0
+DA:453,0
+DA:456,0
+DA:457,0
+DA:459,0
+DA:460,0
+DA:464,0
+DA:465,0
+DA:467,0
+DA:485,0
+DA:487,0
+DA:489,0
+DA:490,0
+DA:492,0
+DA:493,0
+DA:494,0
+DA:496,0
+DA:497,0
+DA:500,0
+DA:501,0
+DA:504,0
+DA:506,0
+DA:507,0
+DA:527,1824
+DA:530,1824
+DA:531,1824
+DA:533,1824
+DA:535,1824
+DA:536,1824
+DA:537,1824
+DA:539,1824
+DA:540,1824
+DA:543,1824
+DA:545,0
+DA:547,1824
+DA:548,1824
+DA:565,14976
+DA:568,14976
+DA:569,14976
+DA:572,14976
+DA:573,14976
+DA:574,14976
+DA:576,14976
+DA:577,14976
+DA:580,14976
+DA:582,0
+DA:584,14976
+DA:585,14976
+DA:602,11088
+DA:605,11088
+DA:606,11088
+DA:608,11088
+DA:610,11088
+DA:611,11088
+DA:612,11088
+DA:614,0
+DA:616,11088
+DA:617,11088
+DA:630,12912
+DA:633,12912
+DA:634,12912
+DA:635,12912
+DA:636,12912
+DA:637,12912
+DA:638,12912
+DA:651,14976
+DA:654,14976
+DA:656,14976
+DA:657,14976
+DA:658,14976
+DA:659,14976
+DA:660,14976
+DA:677,0
+DA:678,0
+DA:681,0
+DA:682,0
+DA:685,0
+DA:686,0
+DA:687,0
+DA:688,0
+DA:692,0
+DA:693,0
+DA:694,0
+DA:698,0
+DA:699,0
+DA:701,0
+DA:702,0
+DA:703,0
+DA:704,0
+DA:705,0
+DA:707,0
+DA:708,0
+DA:709,0
+DA:710,0
+DA:712,0
+DA:713,0
+DA:714,0
+DA:715,0
+DA:716,0
+DA:719,0
+DA:721,0
+DA:722,0
+DA:735,0
+DA:736,0
+DA:738,0
+DA:740,0
+DA:741,0
+DA:745,0
+DA:746,0
+DA:749,0
+DA:752,0
+DA:753,0
+DA:754,0
+DA:755,0
+DA:756,0
+DA:770,0
+DA:771,0
+DA:773,0
+DA:774,0
+DA:776,0
+DA:777,0
+DA:778,0
+DA:782,0
+DA:784,0
+DA:785,0
+DA:787,0
+DA:788,0
+DA:794,0
+DA:795,0
+DA:796,0
+DA:797,0
+DA:798,0
+DA:799,0
+DA:800,0
+DA:802,0
+DA:803,0
+DA:807,0
+DA:810,0
+DA:811,0
+DA:812,0
+DA:816,0
+DA:817,0
+DA:819,0
+DA:820,0
+DA:822,0
+DA:824,0
+DA:825,0
+DA:826,0
+DA:841,0
+DA:842,0
+DA:844,0
+DA:845,0
+DA:847,0
+DA:848,0
+DA:849,0
+DA:853,0
+DA:855,0
+DA:856,0
+DA:858,0
+DA:859,0
+DA:865,0
+DA:866,0
+DA:867,0
+DA:868,0
+DA:869,0
+DA:870,0
+DA:871,0
+DA:873,0
+DA:874,0
+DA:878,0
+DA:881,0
+DA:882,0
+DA:883,0
+DA:887,0
+DA:888,0
+DA:890,0
+DA:891,0
+DA:893,0
+DA:895,0
+DA:896,0
+DA:897,0
+DA:913,0
+DA:914,0
+DA:915,0
+DA:916,0
+DA:919,0
+DA:920,0
+DA:921,0
+DA:922,0
+DA:930,0
+DA:931,0
+DA:932,0
+DA:933,0
+DA:934,0
+DA:935,0
+DA:936,0
+DA:937,0
+DA:938,0
+DA:939,0
+DA:941,0
+DA:942,0
+DA:943,0
+DA:946,0
+DA:947,0
+DA:949,0
+DA:950,0
+DA:954,0
+DA:957,0
+DA:958,0
+DA:959,0
+DA:963,0
+DA:964,0
+DA:966,0
+DA:968,0
+DA:969,0
+DA:971,0
+DA:973,0
+DA:974,0
+DA:975,0
+DA:976,0
+DA:977,0
+DA:978,0
+DA:981,0
+DA:982,0
+DA:983,0
+DA:985,0
+DA:987,0
+DA:988,0
+DA:989,0
+DA:990,0
+DA:1002,0
+DA:1003,0
+DA:1007,0
+DA:1008,0
+DA:1012,0
+DA:1015,0
+DA:1016,0
+DA:1019,0
+DA:1020,0
+DA:1021,0
+DA:1024,0
+DA:1025,0
+DA:1026,0
+DA:1029,0
+DA:1030,0
+DA:1043,0
+DA:1044,0
+DA:1045,0
+DA:1058,0
+DA:1059,0
+DA:1060,0
+DA:1080,0
+DA:1082,0
+DA:1086,0
+DA:1089,0
+DA:1091,0
+DA:1092,0
+DA:1093,0
+DA:1096,0
+DA:1097,0
+DA:1098,0
+DA:1099,0
+DA:1101,0
+DA:1102,0
+DA:1104,0
+DA:1105,0
+DA:1106,0
+DA:1107,0
+DA:1108,0
+DA:1109,0
+DA:1110,0
+DA:1124,0
+DA:1127,0
+DA:1128,0
+DA:1129,0
+DA:1142,3648
+DA:1143,3648
+DA:1144,3648
+DA:1156,0
+DA:1168,123936
+DA:1169,123936
+DA:1170,123936
+DA:1182,20352
+DA:1183,20352
+DA:1184,16704
+DA:1185,16704
+DA:1187,3648
+DA:1188,3648
+DA:1190,3648
+DA:1191,3648
+DA:1192,3648
+DA:1193,3648
+LF:409
+LH:118
+end_of_record
+TN:
+SF:/home/jeremy/Dev/libCEED/interface/ceed.c
+FNL:0,1013,1022
+FNA:0,0,CeedGetRustSourceRoots
+FNL:1,1034,1042
+FNA:1,48,CeedRestoreJitSourceRoots
+FNL:2,1054,1062
+FNA:2,0,CeedRestoreRustSourceRoots
+FNL:3,1077,1086
+FNA:3,0,CeedGetJitDefines
+FNL:4,1098,1106
+FNA:4,0,CeedRestoreJitDefines
+FNL:5,1171,1377
+FNA:5,432,CeedInit
+FNL:6,123,139
+FNA:6,4416,CeedRegisterImpl
+FNL:7,1389,1402
+FNA:7,0,CeedSetStream
+FNL:8,1419,1424
+FNA:8,24420,CeedReferenceCopy
+FNL:9,1436,1439
+FNA:9,648,CeedGetResource
+FNL:10,1451,1466
+FNA:10,0,CeedGetPreferredMemType
+FNL:11,1478,1481
+FNA:11,0,CeedIsDeterministic
+FNL:12,1493,1512
+FNA:12,432,CeedAddJitSourceRoot
+FNL:13,150,153
+FNA:13,0,CeedWorkVectorsCreate
+FNL:14,1524,1545
+FNA:14,0,CeedAddRustSourceRoot
+FNL:15,1557,1576
+FNA:15,0,CeedAddJitDefine
+FNL:16,1588,1591
+FNA:16,0,CeedSetNumViewTabs
+FNL:17,1603,1606
+FNA:17,0,CeedGetNumViewTabs
+FNL:18,1618,1638
+FNA:18,0,CeedView
+FNL:19,164,179
+FNA:19,300,CeedWorkVectorsDestroy
+FNL:20,1649,1693
+FNA:20,48480,CeedDestroy
+FNL:21,1711,1732
+FNA:21,0,CeedErrorImpl
+FNL:22,1800,1807
+FNA:22,0,CeedErrorExit
+FNL:23,1819,1824
+FNA:23,0,CeedSetErrorHandler
+FNL:24,1838,1842
+FNA:24,0,CeedGetErrorMessage
+FNL:25,1856,1861
+FNA:25,0,CeedResetErrorMessage
+FNL:26,1882,1888
+FNA:26,0,CeedGetVersion
+FNL:27,1899,1902
+FNA:27,0,CeedGetScalarType
+FNL:28,191,194
+FNA:28,0,CeedView_Object
+FNL:29,205,208
+FNA:29,0,CeedDestroy_Object
+FNL:30,282,286
+FNA:30,1812,CeedMallocArray
+FNL:31,303,307
+FNA:31,43296,CeedCallocArray
+FNL:32,324,328
+FNA:32,1776,CeedReallocArray
+FNL:33,344,349
+FNA:33,2928,CeedStringAllocCopy
+FNL:34,360,364
+FNA:34,58680,CeedFree
+FNL:35,380,407
+FNA:35,5400,CeedSetHostGenericArray
+FNL:36,422,426
+FNA:36,0,CeedSetHostBoolArray
+FNL:37,441,445
+FNA:37,0,CeedSetHostCeedInt8Array
+FNL:38,460,464
+FNA:38,648,CeedSetHostCeedIntArray
+FNL:39,479,483
+FNA:39,4752,CeedSetHostCeedScalarArray
+FNL:40,498,502
+FNA:40,1536,CeedRegister
+FNL:41,514,517
+FNA:41,0,CeedIsDebug
+FNL:42,533,540
+FNA:42,0,CeedGetResourceRoot
+FNL:43,552,560
+FNA:43,4032,CeedGetParent
+FNL:44,572,576
+FNA:44,4428,CeedGetDelegate
+FNL:45,591,595
+FNA:45,240,CeedSetDelegate
+FNL:46,608,621
+FNA:46,4428,CeedGetObjectDelegate
+FNL:47,638,656
+FNA:47,0,CeedSetObjectDelegate
+FNL:48,668,678
+FNA:48,0,CeedGetOperatorFallbackCeed
+FNL:49,692,696
+FNA:49,0,CeedSetOperatorFallbackCeed
+FNL:50,708,711
+FNA:50,384,CeedSetDeterministic
+FNL:51,730,752
+FNA:51,63624,CeedSetBackendFunctionImpl
+FNL:52,764,767
+FNA:52,672,CeedGetData
+FNL:53,779,782
+FNA:53,96,CeedSetData
+FNL:54,793,796
+FNA:54,24420,CeedReference
+FNL:55,808,830
+FNA:55,0,CeedGetWorkVectorMemoryUsage
+FNL:56,842,874
+FNA:56,0,CeedClearWorkVectors
+FNL:57,889,938
+FNA:57,0,CeedGetWorkVector
+FNL:58,95,98
+FNA:58,0,CeedRequestWait
+FNL:59,950,974
+FNA:59,0,CeedRestoreWorkVector
+FNL:60,989,998
+FNA:60,48,CeedGetJitSourceRoots
+FNF:61
+FNH:27
+DA:95,0
+DA:96,0
+DA:97,0
+DA:123,4416
+DA:124,4416
+DA:127,4416
+DA:128,4416
+DA:129,4416
+DA:130,4416
+DA:131,4416
+DA:132,4416
+DA:134,0
+DA:137,4416
+DA:138,4416
+DA:150,0
+DA:151,0
+DA:152,0
+DA:164,300
+DA:165,300
+DA:166,0
+DA:167,0
+DA:169,0
+DA:170,0
+DA:171,0
+DA:173,0
+DA:175,0
+DA:176,0
+DA:177,0
+DA:178,0
+DA:191,0
+DA:192,0
+DA:193,0
+DA:205,0
+DA:206,0
+DA:207,0
+DA:282,1812
+DA:283,1812
+DA:284,1812
+DA:285,1812
+DA:303,43296
+DA:304,43296
+DA:305,43296
+DA:306,43296
+DA:324,1776
+DA:325,1776
+DA:326,1776
+DA:327,1776
+DA:344,2928
+DA:345,2928
+DA:346,2928
+DA:347,2928
+DA:348,2928
+DA:360,58680
+DA:361,58680
+DA:362,58680
+DA:363,58680
+DA:380,5400
+DA:382,5400
+DA:383,2112
+DA:384,2112
+DA:385,2112
+DA:386,0
+DA:388,2112
+DA:389,2112
+DA:392,2112
+DA:393,2112
+DA:394,360
+DA:395,360
+DA:396,360
+DA:397,360
+DA:398,360
+DA:399,360
+DA:400,2928
+DA:401,2928
+DA:402,2928
+DA:403,2928
+DA:404,2928
+DA:406,5400
+DA:422,0
+DA:424,0
+DA:425,0
+DA:441,0
+DA:443,0
+DA:444,0
+DA:460,648
+DA:462,648
+DA:463,648
+DA:479,4752
+DA:481,4752
+DA:482,4752
+DA:498,1536
+DA:499,1536
+DA:500,1536
+DA:501,1536
+DA:514,0
+DA:515,0
+DA:516,0
+DA:533,0
+DA:534,0
+DA:535,0
+DA:537,0
+DA:538,0
+DA:539,0
+DA:552,4032
+DA:553,4032
+DA:554,1704
+DA:555,1704
+DA:557,2328
+DA:558,2328
+DA:559,2328
+DA:572,4428
+DA:573,4428
+DA:574,4428
+DA:575,4428
+DA:591,240
+DA:592,240
+DA:593,240
+DA:594,240
+DA:608,4428
+DA:610,4428
+DA:611,0
+DA:612,0
+DA:613,0
+DA:614,0
+DA:619,4428
+DA:620,4428
+DA:638,0
+DA:639,0
+DA:642,0
+DA:643,0
+DA:645,0
+DA:647,0
+DA:650,0
+DA:651,0
+DA:654,0
+DA:655,0
+DA:668,0
+DA:669,0
+DA:670,0
+DA:671,0
+DA:675,0
+DA:676,0
+DA:677,0
+DA:692,0
+DA:693,0
+DA:694,0
+DA:695,0
+DA:708,384
+DA:709,384
+DA:710,384
+DA:730,63624
+DA:731,63624
+DA:734,63624
+DA:735,63624
+DA:736,63624
+DA:739,2096256
+DA:740,2096256
+DA:741,63624
+DA:742,63624
+DA:744,63624
+DA:745,63624
+DA:764,672
+DA:765,672
+DA:766,672
+DA:779,96
+DA:780,96
+DA:781,96
+DA:793,24420
+DA:794,24420
+DA:795,24420
+DA:808,0
+DA:809,0
+DA:812,0
+DA:813,0
+DA:814,0
+DA:815,0
+DA:816,0
+DA:818,0
+DA:819,0
+DA:820,0
+DA:822,0
+DA:823,0
+DA:825,0
+DA:826,0
+DA:829,0
+DA:842,0
+DA:843,0
+DA:846,0
+DA:847,0
+DA:848,0
+DA:849,0
+DA:850,0
+DA:852,0
+DA:853,0
+DA:854,0
+DA:856,0
+DA:857,0
+DA:859,0
+DA:860,0
+DA:861,0
+DA:863,0
+DA:864,0
+DA:865,0
+DA:866,0
+DA:867,0
+DA:868,0
+DA:869,0
+DA:873,0
+DA:889,0
+DA:890,0
+DA:893,0
+DA:896,0
+DA:897,0
+DA:898,0
+DA:899,0
+DA:900,0
+DA:903,0
+DA:906,0
+DA:907,0
+DA:910,0
+DA:911,0
+DA:915,0
+DA:916,0
+DA:917,0
+DA:918,0
+DA:919,0
+DA:920,0
+DA:921,0
+DA:922,0
+DA:923,0
+DA:925,0
+DA:926,0
+DA:928,0
+DA:929,0
+DA:932,0
+DA:933,0
+DA:934,0
+DA:936,0
+DA:937,0
+DA:950,0
+DA:951,0
+DA:954,0
+DA:955,0
+DA:956,0
+DA:957,0
+DA:958,0
+DA:961,0
+DA:962,0
+DA:963,0
+DA:964,0
+DA:965,0
+DA:967,0
+DA:968,0
+DA:989,48
+DA:992,48
+DA:993,48
+DA:994,48
+DA:995,48
+DA:996,48
+DA:997,48
+DA:1013,0
+DA:1016,0
+DA:1017,0
+DA:1018,0
+DA:1019,0
+DA:1020,0
+DA:1021,0
+DA:1034,48
+DA:1037,48
+DA:1038,48
+DA:1039,48
+DA:1040,48
+DA:1041,48
+DA:1054,0
+DA:1057,0
+DA:1058,0
+DA:1059,0
+DA:1060,0
+DA:1061,0
+DA:1077,0
+DA:1080,0
+DA:1081,0
+DA:1082,0
+DA:1083,0
+DA:1084,0
+DA:1085,0
+DA:1098,0
+DA:1101,0
+DA:1102,0
+DA:1103,0
+DA:1104,0
+DA:1105,0
+DA:1171,432
+DA:1172,432
+DA:1175,432
+DA:1176,432
+DA:1179,432
+DA:1180,432
+DA:1181,432
+DA:1182,432
+DA:1183,0
+DA:1185,0
+DA:1186,0
+DA:1188,0
+DA:1190,0
+DA:1191,0
+DA:1193,432
+DA:1197,432
+DA:1198,9504
+DA:1199,10368
+DA:1200,9936
+DA:1201,9936
+DA:1202,65520
+DA:1203,9936
+DA:1204,9936
+DA:1205,864
+DA:1206,864
+DA:1207,864
+DA:1211,432
+DA:1249,432
+DA:1250,432
+DA:1251,432
+DA:1252,432
+DA:1253,432
+DA:1254,432
+DA:1255,432
+DA:1256,0
+DA:1257,0
+DA:1258,432
+DA:1259,432
+DA:1262,432
+DA:1351,432
+DA:1352,432
+DA:1355,432
+DA:1358,432
+DA:1362,432
+DA:1366,432
+DA:1368,432
+DA:1369,0
+DA:1371,432
+DA:1375,432
+DA:1376,432
+DA:1389,0
+DA:1390,0
+DA:1391,0
+DA:1392,0
+DA:1395,0
+DA:1397,0
+DA:1398,0
+DA:1399,0
+DA:1401,0
+DA:1419,24420
+DA:1420,24420
+DA:1421,24420
+DA:1422,24420
+DA:1423,24420
+DA:1436,648
+DA:1437,648
+DA:1438,648
+DA:1451,0
+DA:1452,0
+DA:1453,0
+DA:1456,0
+DA:1458,0
+DA:1459,0
+DA:1461,0
+DA:1463,0
+DA:1465,0
+DA:1478,0
+DA:1479,0
+DA:1480,0
+DA:1493,432
+DA:1496,432
+DA:1497,432
+DA:1499,432
+DA:1500,432
+DA:1502,432
+DA:1503,432
+DA:1504,432
+DA:1505,432
+DA:1507,432
+DA:1508,432
+DA:1509,432
+DA:1510,432
+DA:1511,432
+DA:1524,0
+DA:1527,0
+DA:1528,0
+DA:1530,0
+DA:1531,0
+DA:1533,0
+DA:1534,0
+DA:1535,0
+DA:1536,0
+DA:1538,0
+DA:1539,0
+DA:1540,0
+DA:1541,0
+DA:1542,0
+DA:1543,0
+DA:1544,0
+DA:1557,0
+DA:1560,0
+DA:1561,0
+DA:1563,0
+DA:1564,0
+DA:1566,0
+DA:1567,0
+DA:1568,0
+DA:1569,0
+DA:1571,0
+DA:1572,0
+DA:1573,0
+DA:1574,0
+DA:1575,0
+DA:1588,0
+DA:1589,0
+DA:1590,0
+DA:1603,0
+DA:1604,0
+DA:1605,0
+DA:1618,0
+DA:1619,0
+DA:1622,0
+DA:1625,0
+DA:1627,0
+DA:1628,0
+DA:1629,0
+DA:1631,0
+DA:1635,0
+DA:1636,0
+DA:1637,0
+DA:1649,48480
+DA:1650,48480
+DA:1651,48180
+DA:1652,48180
+DA:1655,300
+DA:1657,300
+DA:1659,300
+DA:1661,300
+DA:1662,0
+DA:1663,0
+DA:1664,0
+DA:1666,0
+DA:1669,300
+DA:1671,600
+DA:1672,300
+DA:1674,300
+DA:1676,300
+DA:1677,0
+DA:1679,300
+DA:1681,300
+DA:1682,0
+DA:1684,300
+DA:1686,300
+DA:1687,300
+DA:1688,300
+DA:1689,300
+DA:1690,300
+DA:1691,300
+DA:1692,300
+DA:1711,0
+DA:1715,0
+DA:1716,0
+DA:1717,0
+DA:1800,0
+DA:1801,0
+DA:1803,0
+DA:1804,0
+DA:1805,0
+DA:1819,0
+DA:1820,0
+DA:1821,0
+DA:1822,0
+DA:1823,0
+DA:1838,0
+DA:1839,0
+DA:1840,0
+DA:1841,0
+DA:1856,0
+DA:1857,0
+DA:1858,0
+DA:1859,0
+DA:1860,0
+DA:1882,0
+DA:1883,0
+DA:1884,0
+DA:1885,0
+DA:1886,0
+DA:1887,0
+DA:1899,0
+DA:1900,0
+DA:1901,0
+LF:493
+LH:205
+end_of_record
+TN:
+SF:/usr/include/valgrind/valgrind.h
+FNL:0,7293,7322
+FNA:0,0,VALGRIND_PRINTF
+FNL:1,7332,7361
+FNA:1,0,VALGRIND_PRINTF_BACKTRACE
+FNF:2
+FNH:0
+DA:7293,0
+DA:7305,0
+DA:7313,0
+DA:7319,0
+DA:7320,0
+DA:7332,0
+DA:7344,0
+DA:7352,0
+DA:7358,0
+DA:7359,0
+LF:10
+LH:0
+end_of_record
diff --git a/doc/img/libCEEDBackends.svg b/doc/img/libCEEDBackends.svg
index cff3b2527a..d8e96bb13b 100644
--- a/doc/img/libCEEDBackends.svg
+++ b/doc/img/libCEEDBackends.svg
@@ -1,1862 +1,1128 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="349.31" height="250.054" viewBox="0 0 349.31 250.054">
-<defs>
-<g>
-<g id="glyph-0-0">
-<path d="M 4.859375 -3.28125 C 4.46875 -3.28125 3.96875 -3.265625 3.625 -3.265625 C 3.28125 -3.265625 2.625 -3.28125 2.359375 -3.28125 L 3.640625 -6.234375 L 3.65625 -6.234375 C 4.140625 -5.109375 4.546875 -4.125 4.859375 -3.28125 Z M 2.140625 -2.71875 C 2.4375 -2.75 3.265625 -2.765625 3.6875 -2.765625 C 4.140625 -2.765625 4.796875 -2.75 5.078125 -2.71875 C 5.671875 -1.15625 5.921875 -0.234375 6.015625 0.03125 C 6.21875 0 6.40625 0 6.609375 0 C 6.8125 0 7.125 0 7.328125 0.03125 C 6.734375 -1.171875 5.15625 -5.25 4.046875 -7.859375 L 3.71875 -7.859375 C 2.5625 -5.21875 1.40625 -2.609375 0.1875 0.03125 C 0.328125 0 0.484375 0 0.609375 0 C 0.734375 0 1 0 1.140625 0.03125 C 1.34375 -0.6875 1.703125 -1.671875 2.140625 -2.71875 Z M 2.140625 -2.71875 "/>
-</g>
-<g id="glyph-0-1">
-<path d="M 1.9375 -3.515625 C 2.203125 -4.140625 2.90625 -4.703125 3.375 -4.703125 C 4.1875 -4.703125 4.609375 -3.953125 4.609375 -2.609375 C 4.609375 -1.625 4.328125 -0.359375 3.046875 -0.359375 C 2.859375 -0.359375 2.359375 -0.40625 1.9375 -0.890625 Z M 1.9375 -4.203125 L 1.9375 -5.078125 C 1.9375 -5.15625 1.90625 -5.203125 1.859375 -5.203125 C 1.59375 -5.140625 1.15625 -5.125 0.890625 -5.15625 L 0.875 -5.125 C 0.953125 -4.609375 0.984375 -3.703125 0.984375 -2.8125 L 0.984375 0.375 C 0.984375 1.28125 0.953125 2.140625 0.875 2.765625 L 0.890625 2.8125 C 1.046875 2.78125 1.3125 2.765625 1.453125 2.765625 C 1.609375 2.765625 1.875 2.78125 2.015625 2.8125 L 2.046875 2.765625 C 1.953125 2.09375 1.9375 1.296875 1.9375 0.375 L 1.9375 -0.140625 C 2.25 0.03125 2.6875 0.125 3.125 0.125 C 4.703125 0.125 5.65625 -1.078125 5.65625 -2.765625 C 5.65625 -4.03125 4.90625 -5.25 3.578125 -5.25 C 3.109375 -5.25 2.515625 -5.015625 1.953125 -4.171875 Z M 1.9375 -4.203125 "/>
-</g>
-<g id="glyph-0-2">
-<path d="M 1.078125 -2.390625 C 1.078125 -1.5 1.046875 -0.640625 0.96875 0 L 0.984375 0.03125 C 1.140625 0.015625 1.40625 0 1.546875 0 C 1.703125 0 1.96875 0.015625 2.109375 0.03125 L 2.140625 0 C 2.046875 -0.6875 2.03125 -1.484375 2.03125 -2.390625 L 2.03125 -5.96875 C 2.03125 -6.875 2.078125 -7.453125 2.140625 -8.21875 C 2.140625 -8.3125 2.109375 -8.34375 2.03125 -8.34375 C 1.71875 -8.21875 1.46875 -8.140625 0.984375 -8.109375 L 0.96875 -8.0625 C 1.046875 -7.546875 1.078125 -6.671875 1.078125 -5.765625 Z M 1.078125 -2.390625 "/>
-</g>
-<g id="glyph-0-3">
-<path d="M 0.953125 -7.140625 C 0.953125 -6.8125 1.25 -6.53125 1.5625 -6.53125 C 1.890625 -6.53125 2.171875 -6.8125 2.171875 -7.140625 C 2.171875 -7.453125 1.890625 -7.75 1.5625 -7.75 C 1.25 -7.75 0.953125 -7.453125 0.953125 -7.140625 Z M 1.078125 -2.8125 L 1.078125 -2.15625 C 1.078125 -1.25 1.046875 -0.640625 0.96875 0 L 0.984375 0.03125 C 1.140625 0.015625 1.40625 0 1.546875 0 C 1.703125 0 1.96875 0.015625 2.109375 0.03125 L 2.140625 0 C 2.046875 -0.671875 2.03125 -1.25 2.03125 -2.15625 L 2.03125 -3 C 2.03125 -3.890625 2.0625 -4.328125 2.140625 -5.0625 C 2.140625 -5.15625 2.109375 -5.171875 2.03125 -5.171875 C 1.75 -5.140625 1.25 -5.140625 0.984375 -5.15625 L 0.96875 -5.125 C 1.046875 -4.609375 1.078125 -3.703125 1.078125 -2.8125 Z M 1.078125 -2.8125 "/>
-</g>
-<g id="glyph-0-4">
-<path d="M 2.984375 -4.765625 C 3.4375 -4.765625 3.859375 -4.421875 4.1875 -3.890625 L 4.34375 -3.890625 L 4.609375 -4.875 L 4.59375 -4.90625 C 4.21875 -5.109375 3.59375 -5.25 3 -5.25 C 1.75 -5.25 0.453125 -4.234375 0.453125 -2.609375 C 0.453125 -0.9375 1.40625 0.125 2.859375 0.125 C 3.578125 0.125 4.140625 -0.125 4.609375 -0.734375 L 4.390625 -0.984375 L 4.34375 -0.984375 C 3.890625 -0.578125 3.5 -0.484375 3.0625 -0.484375 C 2.1875 -0.484375 1.5 -1.265625 1.5 -2.671875 C 1.5 -3.984375 2.203125 -4.765625 2.984375 -4.765625 Z M 2.984375 -4.765625 "/>
-</g>
-<g id="glyph-0-5">
-<path d="M 3.703125 -2.6875 L 3.65625 -1.265625 C 3.65625 -1.109375 3.578125 -1.03125 3.484375 -0.953125 C 3.15625 -0.703125 2.75 -0.484375 2.390625 -0.484375 C 1.859375 -0.484375 1.5 -0.84375 1.5 -1.21875 C 1.5 -1.75 1.75 -2.15625 2.6875 -2.421875 Z M 3.703125 -0.578125 C 3.84375 -0.09375 4.203125 0.125 4.640625 0.125 C 4.9375 0.125 5.3125 0.046875 5.578125 -0.25 L 5.5 -0.53125 C 5.359375 -0.484375 5.265625 -0.484375 5.171875 -0.484375 C 5.0625 -0.484375 4.90625 -0.5 4.8125 -0.578125 C 4.703125 -0.6875 4.625 -0.96875 4.625 -1.5 C 4.625 -1.859375 4.65625 -3.171875 4.65625 -3.3125 C 4.65625 -4.921875 3.59375 -5.25 2.6875 -5.25 C 1.78125 -5.25 1.234375 -4.8125 0.9375 -4.546875 L 0.890625 -4.5 L 1.09375 -3.703125 L 1.25 -3.6875 C 1.59375 -4.234375 2 -4.703125 2.5625 -4.703125 C 3 -4.703125 3.71875 -4.65625 3.71875 -3.296875 C 3.71875 -3.21875 3.671875 -3.171875 3.640625 -3.15625 L 2.515625 -2.90625 C 1.296875 -2.625 0.53125 -1.984375 0.53125 -1.171875 C 0.53125 -0.28125 1.140625 0.125 2.03125 0.125 C 2.6875 0.125 3.03125 -0.03125 3.65625 -0.578125 Z M 3.703125 -0.578125 "/>
-</g>
-<g id="glyph-0-6">
-<path d="M 1.140625 -5.125 C 0.890625 -5.125 0.640625 -5.125 0.484375 -5.15625 C 0.40625 -4.953125 0.328125 -4.84375 0.234375 -4.65625 L 0.28125 -4.578125 C 0.484375 -4.59375 0.859375 -4.59375 1.140625 -4.609375 L 1.140625 -2.96875 C 1.140625 -2.25 1.09375 -1.390625 1.09375 -1.03125 C 1.09375 -0.25 1.609375 0.125 2.15625 0.125 C 2.65625 0.125 3.03125 0 3.53125 -0.328125 L 3.375 -0.609375 C 3.03125 -0.484375 2.75 -0.484375 2.453125 -0.53125 C 2.15625 -0.5625 2.0625 -0.84375 2.0625 -1.5 C 2.0625 -1.859375 2.09375 -2.375 2.09375 -3.09375 L 2.09375 -4.609375 L 2.546875 -4.609375 C 2.828125 -4.609375 3.234375 -4.59375 3.40625 -4.578125 C 3.4375 -4.765625 3.484375 -4.890625 3.546875 -5.0625 L 3.484375 -5.15625 C 3.28125 -5.140625 2.921875 -5.125 2.65625 -5.125 L 2.09375 -5.125 C 2.09375 -6.015625 2.09375 -6.1875 2.15625 -6.875 C 2.15625 -6.953125 2.109375 -7 2.046875 -7 C 1.734375 -6.875 1.59375 -6.734375 1.203125 -6.6875 L 1.1875 -6.65625 C 1.15625 -6.234375 1.140625 -5.828125 1.140625 -5.125 Z M 1.140625 -5.125 "/>
-</g>
-<g id="glyph-0-7">
-<path d="M 0.453125 -2.453125 C 0.453125 -1.015625 1.40625 0.125 2.984375 0.125 C 4.5625 0.125 5.515625 -0.984375 5.515625 -2.5625 C 5.515625 -4.1875 4.65625 -5.25 3.03125 -5.25 C 1.46875 -5.25 0.453125 -4.140625 0.453125 -2.453125 Z M 2.953125 -4.765625 C 4.21875 -4.765625 4.46875 -3.796875 4.46875 -2.34375 C 4.46875 -1.1875 3.890625 -0.359375 3.09375 -0.359375 C 1.828125 -0.359375 1.5 -1.734375 1.5 -2.65625 C 1.5 -3.6875 1.828125 -4.765625 2.953125 -4.765625 Z M 2.953125 -4.765625 "/>
-</g>
-<g id="glyph-0-8">
-<path d="M 5.5 -2.15625 C 5.5 -2.53125 5.515625 -2.96875 5.515625 -3.359375 C 5.515625 -4.609375 5.109375 -5.25 4.046875 -5.25 C 3.578125 -5.25 2.734375 -5.0625 1.96875 -4.171875 L 1.953125 -4.203125 L 1.953125 -5.078125 C 1.9375 -5.15625 1.921875 -5.203125 1.875 -5.203125 C 1.609375 -5.140625 1.171875 -5.125 0.90625 -5.15625 L 0.890625 -5.125 C 0.96875 -4.609375 0.984375 -3.703125 0.984375 -2.8125 L 0.984375 -2.15625 C 0.984375 -1.25 0.984375 -0.640625 0.890625 0 L 0.90625 0.03125 C 1.046875 0.015625 1.328125 0 1.46875 0 C 1.609375 0 1.890625 0.015625 2.03125 0.03125 L 2.0625 0 C 1.953125 -0.6875 1.953125 -1.25 1.953125 -2.15625 L 1.953125 -3.546875 C 2.5625 -4.265625 3.25 -4.53125 3.6875 -4.53125 C 4.296875 -4.53125 4.546875 -4.296875 4.546875 -3.328125 L 4.546875 -2.15625 C 4.546875 -1.25 4.515625 -0.640625 4.4375 0 L 4.453125 0.03125 C 4.609375 0.015625 4.875 0 5.015625 0 C 5.15625 0 5.4375 0.015625 5.578125 0.03125 L 5.609375 0 C 5.515625 -0.6875 5.5 -1.25 5.5 -2.15625 Z M 5.5 -2.15625 "/>
-</g>
-<g id="glyph-0-9">
-<path d="M 2.28125 -5.3125 C 2.28125 -6.21875 2.3125 -7.078125 2.40625 -7.703125 L 2.390625 -7.75 C 2.25 -7.71875 1.90625 -7.703125 1.765625 -7.703125 C 1.625 -7.703125 1.296875 -7.71875 1.15625 -7.75 L 1.140625 -7.703125 C 1.234375 -7.03125 1.25 -6.21875 1.25 -5.3125 L 1.25 -2.390625 C 1.25 -1.5 1.234375 -0.640625 1.140625 0 L 1.140625 0.03125 C 1.140625 0.03125 1.359375 0 1.765625 0 L 5.03125 0 C 5.28125 0 5.5625 0.015625 5.75 0.03125 L 5.765625 0 C 5.765625 -0.171875 5.75 -0.40625 5.75 -0.53125 C 5.75 -0.640625 5.765625 -0.84375 5.765625 -0.9375 L 5.75 -0.984375 C 5.75 -0.984375 3.984375 -0.734375 2.375 -0.734375 C 2.296875 -1.015625 2.28125 -2.1875 2.28125 -2.390625 Z M 2.28125 -5.3125 "/>
-</g>
-<g id="glyph-0-10">
-<path d="M 1.9375 -0.96875 L 1.9375 -4.0625 C 2.40625 -4.546875 2.703125 -4.765625 3.1875 -4.765625 C 3.921875 -4.765625 4.5625 -4.15625 4.5625 -2.671875 C 4.5625 -1.1875 4.125 -0.359375 2.984375 -0.359375 C 2.625 -0.359375 2.171875 -0.671875 1.9375 -0.96875 Z M 1.9375 -5.96875 C 1.9375 -6.859375 1.953125 -7.484375 2.015625 -8.21875 C 2.015625 -8.3125 1.96875 -8.34375 1.90625 -8.34375 C 1.59375 -8.21875 1.34375 -8.140625 0.859375 -8.109375 L 0.84375 -8.0625 C 0.921875 -7.546875 0.984375 -6.671875 0.984375 -5.765625 L 0.984375 -0.875 C 0.984375 -0.40625 0.96875 -0.234375 0.921875 0.03125 C 0.984375 0.09375 1.125 0.125 1.25 0.125 C 1.390625 -0.03125 1.515625 -0.234375 1.65625 -0.46875 C 2 -0.171875 2.546875 0.125 3.0625 0.125 C 4.28125 0.125 5.625 -0.734375 5.625 -2.765625 C 5.625 -4.234375 4.59375 -5.25 3.4375 -5.25 C 2.859375 -5.25 2.34375 -5.09375 1.9375 -4.65625 Z M 1.9375 -5.96875 "/>
-</g>
-<g id="glyph-0-11">
-<path d="M 2 -4 L 2 -5.078125 C 2 -5.15625 1.96875 -5.203125 1.921875 -5.203125 C 1.65625 -5.140625 1.21875 -5.125 0.953125 -5.15625 L 0.9375 -5.125 C 1.015625 -4.609375 1.046875 -3.703125 1.046875 -2.8125 L 1.046875 -2.15625 C 1.046875 -1.25 1.015625 -0.640625 0.9375 0 L 0.953125 0.03125 C 1.09375 0.015625 1.375 0 1.515625 0 C 1.65625 0 1.9375 0.015625 2.078125 0.03125 L 2.109375 0 C 2.015625 -0.6875 2 -1.25 2 -2.15625 L 2 -2.765625 C 2 -3.28125 2.140625 -3.5625 2.40625 -3.953125 C 2.5625 -4.21875 2.84375 -4.375 3.078125 -4.375 C 3.3125 -4.375 3.53125 -4.34375 3.6875 -4.203125 L 3.796875 -4.234375 L 4.03125 -5.109375 L 3.984375 -5.15625 C 3.78125 -5.21875 3.765625 -5.25 3.546875 -5.25 C 2.90625 -5.25 2.5625 -4.875 2.03125 -3.96875 Z M 2 -4 "/>
-</g>
-<g id="glyph-0-12">
-<path d="M 0.328125 -5.15625 C 0.796875 -4.09375 2.21875 -1.015625 2.65625 0.125 C 2.265625 1.015625 1.78125 1.9375 1.25 2.828125 C 1.359375 2.796875 1.546875 2.765625 1.65625 2.765625 C 1.78125 2.765625 2.125 2.796875 2.25 2.828125 C 2.703125 1.34375 5.3125 -4.359375 5.71875 -5.15625 C 5.59375 -5.125 5.3125 -5.125 5.203125 -5.125 C 5.078125 -5.125 4.859375 -5.125 4.734375 -5.15625 C 4.296875 -3.8125 3.734375 -2.28125 3.15625 -1.046875 L 3.125 -1.046875 C 2.546875 -2.46875 1.984375 -3.796875 1.546875 -5.15625 C 1.390625 -5.125 1.125 -5.125 0.953125 -5.125 C 0.796875 -5.125 0.5 -5.125 0.328125 -5.15625 Z M 0.328125 -5.15625 "/>
-</g>
-<g id="glyph-0-13">
-<path d="M 2.265625 -6.6875 C 2.265625 -7.078125 2.390625 -7.234375 3.359375 -7.234375 C 4.03125 -7.234375 4.875 -6.96875 4.875 -5.75 C 4.875 -4.734375 4.203125 -4.4375 3.203125 -4.4375 L 2.265625 -4.4375 Z M 2.265625 -3.921875 L 3.375 -3.921875 C 4.78125 -3.921875 5.4375 -3.046875 5.4375 -1.96875 C 5.4375 -1.140625 5.15625 -0.484375 3.4375 -0.484375 C 2.59375 -0.484375 2.265625 -0.65625 2.265625 -1.046875 Z M 1.75 -7.703125 C 1.34375 -7.703125 1.140625 -7.75 1.140625 -7.75 L 1.125 -7.703125 C 1.21875 -7.03125 1.25 -6.21875 1.25 -5.3125 L 1.25 -2.390625 C 1.25 -1.5 1.21875 -0.640625 1.125 0 L 1.140625 0.03125 C 1.140625 0.03125 1.34375 0 1.75 0 C 2.5 0 2.59375 0.03125 3.65625 0.03125 C 6.03125 0.03125 6.625 -1.171875 6.625 -2.203125 C 6.625 -3.34375 5.84375 -4 4.8125 -4.296875 C 5.421875 -4.609375 5.921875 -5.234375 5.921875 -5.890625 C 5.921875 -6.6875 5.5 -7.75 3.28125 -7.75 C 2.859375 -7.75 2.359375 -7.703125 1.75 -7.703125 Z M 1.75 -7.703125 "/>
-</g>
-<g id="glyph-0-14">
-<path d="M 1.078125 -2.390625 C 1.078125 -1.5 1.046875 -0.640625 0.96875 0 L 0.984375 0.03125 C 1.140625 0.015625 1.40625 0 1.546875 0 C 1.703125 0 1.96875 0.015625 2.109375 0.03125 L 2.140625 0 C 2.046875 -0.6875 2.03125 -1.484375 2.03125 -2.390625 L 2.03125 -2.515625 C 2.125 -2.5 2.421875 -2.46875 2.515625 -2.375 C 3.296875 -1.5625 3.578125 -1.140625 4.40625 0.03125 C 4.578125 0.03125 4.984375 0 5.1875 0 C 5.375 0 5.796875 0.03125 5.90625 0.03125 L 5.921875 0 C 5.03125 -0.9375 4.40625 -1.453125 3.265625 -2.84375 C 3.734375 -3.328125 4.90625 -4.453125 5.671875 -5.125 L 5.65625 -5.15625 C 5.421875 -5.125 4.765625 -5.125 4.453125 -5.125 C 3.921875 -4.375 3.03125 -3.40625 2.5625 -3.046875 C 2.40625 -2.921875 2.1875 -2.875 2.03125 -2.875 L 2.03125 -5.96875 C 2.03125 -6.875 2.078125 -7.453125 2.140625 -8.21875 C 2.140625 -8.3125 2.109375 -8.34375 2.03125 -8.34375 C 1.71875 -8.21875 1.46875 -8.140625 0.984375 -8.109375 L 0.96875 -8.0625 C 1.046875 -7.546875 1.078125 -6.671875 1.078125 -5.765625 Z M 1.078125 -2.390625 "/>
-</g>
-<g id="glyph-0-15">
-<path d="M 1.5 -3.265625 C 1.703125 -4.5 2.46875 -4.765625 2.859375 -4.765625 C 3.328125 -4.765625 3.859375 -4.328125 3.859375 -3.4375 C 3.859375 -3.328125 3.8125 -3.265625 3.6875 -3.265625 Z M 4.6875 -1.25 C 4.265625 -0.78125 3.703125 -0.578125 3.046875 -0.578125 C 2.625 -0.578125 2.0625 -0.734375 1.75 -1.265625 C 1.53125 -1.609375 1.453125 -2.078125 1.453125 -2.78125 L 4.703125 -2.78125 C 4.84375 -2.78125 4.921875 -2.859375 4.921875 -2.984375 C 4.921875 -4 4.4375 -5.25 2.859375 -5.25 C 1.625 -5.25 0.40625 -4.25 0.40625 -2.5 C 0.40625 -1.8125 0.53125 -1.140625 0.9375 -0.671875 C 1.34375 -0.171875 2.03125 0.125 2.84375 0.125 C 3.703125 0.125 4.46875 -0.328125 4.921875 -0.9375 Z M 4.6875 -1.25 "/>
-</g>
-<g id="glyph-0-16">
-<path d="M 4.3125 -1.34375 C 3.875 -0.78125 3.328125 -0.5 2.84375 -0.5 C 2.203125 -0.5 1.65625 -1.140625 1.65625 -2.640625 C 1.65625 -4.4375 2.59375 -4.765625 3.171875 -4.765625 C 3.734375 -4.765625 4.03125 -4.53125 4.3125 -4.078125 Z M 4.3125 -0.6875 L 4.34375 -0.6875 L 4.40625 0 C 4.40625 0.03125 4.4375 0.03125 4.5 0.03125 C 4.65625 0.03125 4.75 0 4.90625 0 C 5.0625 0 5.296875 0.015625 5.453125 0.03125 L 5.46875 0 C 5.375 -0.515625 5.265625 -1.40625 5.265625 -2.296875 L 5.265625 -5.96875 C 5.265625 -6.859375 5.3125 -7.484375 5.375 -8.21875 C 5.375 -8.3125 5.34375 -8.34375 5.265625 -8.34375 C 4.953125 -8.21875 4.703125 -8.140625 4.234375 -8.109375 L 4.203125 -8.0625 C 4.296875 -7.546875 4.3125 -6.671875 4.3125 -5.765625 L 4.3125 -4.984375 C 4.046875 -5.140625 3.546875 -5.25 3.3125 -5.25 C 1.75 -5.25 0.609375 -4.171875 0.609375 -2.546875 C 0.609375 -1.078125 1.484375 0.125 2.765625 0.125 C 3.34375 0.125 3.890625 -0.125 4.3125 -0.6875 Z M 4.3125 -0.6875 "/>
-</g>
-<g id="glyph-0-17">
-<path d="M 0.59375 -1.203125 L 0.390625 -0.203125 C 1.078125 0.03125 1.75 0.125 2.203125 0.125 C 3.84375 0.125 4.234375 -0.859375 4.234375 -1.515625 C 4.234375 -2.5 3.453125 -2.875 2.609375 -3.078125 C 2.15625 -3.1875 1.5 -3.421875 1.5 -4.015625 C 1.5 -4.5 1.890625 -4.765625 2.40625 -4.765625 C 3.03125 -4.765625 3.40625 -4.28125 3.6875 -3.953125 L 3.84375 -3.96875 L 4.09375 -4.859375 L 4.0625 -4.890625 C 3.71875 -5.0625 3.0625 -5.25 2.453125 -5.25 C 1.5625 -5.25 0.640625 -4.78125 0.640625 -3.796875 C 0.640625 -2.828125 1.34375 -2.515625 2.078125 -2.3125 C 2.765625 -2.125 3.28125 -1.9375 3.28125 -1.34375 C 3.28125 -0.71875 2.828125 -0.359375 2.1875 -0.359375 C 1.609375 -0.359375 1.109375 -0.75 0.78125 -1.21875 Z M 0.59375 -1.203125 "/>
-</g>
-<g id="glyph-0-18">
-<path d="M 6.40625 -5.3125 L 6.40625 -4.359375 L 2.265625 -4.359375 L 2.265625 -5.3125 C 2.265625 -6.21875 2.296875 -7.078125 2.390625 -7.703125 L 2.375 -7.75 C 2.234375 -7.71875 1.90625 -7.703125 1.75 -7.703125 C 1.609375 -7.703125 1.296875 -7.71875 1.140625 -7.75 L 1.125 -7.703125 C 1.21875 -7.03125 1.25 -6.21875 1.25 -5.3125 L 1.25 -2.390625 C 1.25 -1.5 1.21875 -0.640625 1.125 0 L 1.140625 0.03125 C 1.28125 0.015625 1.609375 0 1.75 0 C 1.90625 0 2.21875 0.015625 2.359375 0.03125 L 2.390625 0 C 2.296875 -0.6875 2.265625 -1.5 2.265625 -2.390625 L 2.265625 -3.84375 L 6.40625 -3.84375 L 6.40625 -2.390625 C 6.40625 -1.5 6.390625 -0.640625 6.28125 0 L 6.296875 0.03125 C 6.4375 0.015625 6.78125 0 6.921875 0 C 7.0625 0 7.390625 0.015625 7.53125 0.03125 L 7.546875 0 C 7.453125 -0.6875 7.4375 -1.5 7.4375 -2.390625 L 7.4375 -5.3125 C 7.4375 -6.21875 7.453125 -7.078125 7.546875 -7.703125 L 7.546875 -7.75 C 7.40625 -7.71875 7.0625 -7.703125 6.921875 -7.703125 C 6.78125 -7.703125 6.453125 -7.71875 6.3125 -7.75 L 6.28125 -7.703125 C 6.390625 -7.03125 6.40625 -6.21875 6.40625 -5.3125 Z M 6.40625 -5.3125 "/>
-</g>
-<g id="glyph-0-19">
-<path d="M 2.453125 0.046875 C 2.5625 0.015625 2.71875 0.015625 2.84375 0.015625 C 2.96875 0.015625 3.171875 0.015625 3.28125 0.046875 C 3.59375 -0.859375 3.9375 -1.859375 4.328125 -2.78125 C 4.6875 -1.859375 5.0625 -0.890625 5.421875 0.03125 C 5.53125 0 5.609375 0 5.71875 0 C 5.84375 0 6.0625 0 6.171875 0.03125 C 6.75 -1.375 7.703125 -3.671875 8.390625 -5.15625 C 8.28125 -5.125 8 -5.125 7.875 -5.125 C 7.75 -5.125 7.578125 -5.125 7.46875 -5.15625 C 7.03125 -3.75 6.484375 -2.21875 5.984375 -1.078125 L 5.890625 -1.078125 C 5.40625 -2.390625 5.03125 -3.890625 4.75 -5.15625 C 4.609375 -5.125 4.328125 -5.125 4.15625 -5.125 C 3.96875 -5.125 3.65625 -5.125 3.46875 -5.15625 C 3.625 -4.671875 3.8125 -4.140625 4.015625 -3.59375 C 3.703125 -2.703125 3.375 -1.8125 3.03125 -1.0625 L 2.953125 -1.0625 C 2.359375 -2.421875 1.953125 -3.78125 1.53125 -5.15625 C 1.375 -5.109375 1.09375 -5.109375 0.9375 -5.109375 C 0.734375 -5.109375 0.4375 -5.109375 0.234375 -5.15625 C 1.046875 -3.390625 1.78125 -1.703125 2.453125 0.046875 Z M 2.453125 0.046875 "/>
-</g>
-<g id="glyph-1-0">
-<path d="M 1.890625 -2.84375 C 2.25 -2.84375 2.546875 -2.859375 2.8125 -2.875 C 3.09375 -2.546875 3.3125 -2.15625 3.5625 -1.75 C 3.921875 -1.171875 4.40625 -0.3125 4.515625 0.03125 C 4.703125 0 4.921875 0 5.109375 0 C 5.3125 0 5.515625 0 5.71875 0.03125 L 5.734375 0 C 5.234375 -0.5625 4.234375 -2.140625 3.59375 -2.984375 C 3.875 -3.03125 4.0625 -3.09375 4.203125 -3.171875 C 4.75 -3.4375 5.203125 -3.953125 5.203125 -4.78125 C 5.203125 -5.28125 5.03125 -5.6875 4.671875 -6.015625 C 4.203125 -6.46875 3.4375 -6.5 2.8125 -6.5 C 2.5625 -6.5 1.765625 -6.4375 1.46875 -6.4375 C 1.34375 -6.4375 1.078125 -6.4375 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 1.078125 0.015625 1.34375 0 1.46875 0 C 1.578125 0 1.859375 0.015625 1.96875 0.03125 L 2 0 C 1.90625 -0.5625 1.890625 -1.25 1.890625 -2 Z M 2.828125 -6.078125 C 3.421875 -6.078125 4.25 -5.84375 4.25 -4.765625 C 4.25 -3.546875 3.4375 -3.25 2.46875 -3.25 L 1.890625 -3.25 L 1.890625 -5.625 C 1.890625 -6.015625 1.921875 -6.078125 2.828125 -6.078125 Z M 2.828125 -6.078125 "/>
-</g>
-<g id="glyph-1-1">
-<path d="M 3.09375 -2.25 L 3.046875 -1.0625 C 3.046875 -0.921875 2.984375 -0.859375 2.90625 -0.796875 C 2.625 -0.59375 2.296875 -0.40625 2 -0.40625 C 1.546875 -0.40625 1.25 -0.703125 1.25 -1.015625 C 1.25 -1.46875 1.46875 -1.796875 2.25 -2.015625 Z M 3.09375 -0.484375 C 3.203125 -0.078125 3.5 0.09375 3.875 0.09375 C 4.109375 0.09375 4.421875 0.046875 4.65625 -0.203125 L 4.578125 -0.453125 C 4.46875 -0.40625 4.390625 -0.40625 4.3125 -0.40625 C 4.21875 -0.40625 4.09375 -0.421875 4.015625 -0.484375 C 3.921875 -0.5625 3.859375 -0.8125 3.859375 -1.25 C 3.859375 -1.546875 3.890625 -2.65625 3.890625 -2.765625 C 3.890625 -4.109375 3 -4.375 2.25 -4.375 C 1.484375 -4.375 1.03125 -4 0.78125 -3.796875 L 0.75 -3.765625 L 0.921875 -3.09375 L 1.046875 -3.078125 C 1.328125 -3.53125 1.671875 -3.921875 2.140625 -3.921875 C 2.5 -3.921875 3.09375 -3.875 3.09375 -2.75 C 3.09375 -2.6875 3.0625 -2.640625 3.03125 -2.625 L 2.109375 -2.421875 C 1.09375 -2.1875 0.453125 -1.65625 0.453125 -0.984375 C 0.453125 -0.234375 0.953125 0.09375 1.6875 0.09375 C 2.25 0.09375 2.515625 -0.03125 3.046875 -0.484375 Z M 3.09375 -0.484375 "/>
-</g>
-<g id="glyph-1-2">
-<path d="M 0.953125 -4.28125 C 0.734375 -4.28125 0.53125 -4.28125 0.40625 -4.296875 C 0.34375 -4.125 0.28125 -4.03125 0.1875 -3.890625 L 0.234375 -3.8125 C 0.40625 -3.828125 0.71875 -3.828125 0.953125 -3.84375 L 0.953125 -2.46875 C 0.953125 -1.875 0.921875 -1.15625 0.921875 -0.859375 C 0.921875 -0.203125 1.328125 0.09375 1.796875 0.09375 C 2.21875 0.09375 2.53125 0 2.953125 -0.265625 L 2.828125 -0.515625 C 2.515625 -0.40625 2.296875 -0.40625 2.046875 -0.4375 C 1.796875 -0.46875 1.71875 -0.703125 1.71875 -1.25 C 1.71875 -1.5625 1.75 -1.984375 1.75 -2.578125 L 1.75 -3.84375 L 2.125 -3.84375 C 2.359375 -3.84375 2.703125 -3.828125 2.84375 -3.8125 C 2.875 -3.984375 2.90625 -4.078125 2.953125 -4.234375 L 2.90625 -4.296875 C 2.734375 -4.28125 2.4375 -4.28125 2.21875 -4.28125 L 1.75 -4.28125 C 1.75 -5.015625 1.75 -5.15625 1.796875 -5.734375 C 1.796875 -5.796875 1.765625 -5.828125 1.703125 -5.828125 C 1.453125 -5.734375 1.328125 -5.625 1 -5.578125 L 0.984375 -5.546875 C 0.96875 -5.203125 0.953125 -4.859375 0.953125 -4.28125 Z M 0.953125 -4.28125 "/>
-</g>
-<g id="glyph-1-3">
-<path d="M 1.25 -2.71875 C 1.421875 -3.765625 2.0625 -3.984375 2.375 -3.984375 C 2.765625 -3.984375 3.21875 -3.609375 3.21875 -2.859375 C 3.21875 -2.765625 3.1875 -2.71875 3.078125 -2.71875 Z M 3.90625 -1.03125 C 3.5625 -0.65625 3.09375 -0.484375 2.546875 -0.484375 C 2.1875 -0.484375 1.71875 -0.625 1.453125 -1.0625 C 1.28125 -1.328125 1.21875 -1.734375 1.21875 -2.328125 L 3.921875 -2.328125 C 4.03125 -2.328125 4.109375 -2.375 4.109375 -2.5 C 4.109375 -3.328125 3.703125 -4.375 2.375 -4.375 C 1.359375 -4.375 0.34375 -3.546875 0.34375 -2.078125 C 0.34375 -1.515625 0.453125 -0.953125 0.78125 -0.5625 C 1.125 -0.140625 1.6875 0.09375 2.375 0.09375 C 3.09375 0.09375 3.734375 -0.265625 4.109375 -0.78125 Z M 3.90625 -1.03125 "/>
-</g>
-<g id="glyph-1-4">
-<path d="M 0.890625 -2 C 0.890625 -1.25 0.875 -0.53125 0.8125 0 L 0.828125 0.03125 C 0.953125 0.015625 1.171875 0 1.296875 0 C 1.421875 0 1.640625 0.015625 1.765625 0.03125 L 1.78125 0 C 1.703125 -0.5625 1.6875 -1.234375 1.6875 -2 L 1.6875 -4.984375 C 1.6875 -5.734375 1.734375 -6.21875 1.78125 -6.859375 C 1.78125 -6.921875 1.75 -6.953125 1.6875 -6.953125 C 1.4375 -6.859375 1.21875 -6.78125 0.828125 -6.765625 L 0.8125 -6.734375 C 0.875 -6.296875 0.890625 -5.5625 0.890625 -4.8125 Z M 0.890625 -2 "/>
-</g>
-<g id="glyph-1-5">
-<path d="M 1.890625 -5.625 C 1.890625 -5.921875 2.078125 -6.078125 2.796875 -6.078125 C 3.484375 -6.078125 4.203125 -5.875 4.203125 -4.6875 C 4.203125 -3.546875 3.65625 -3.1875 2.6875 -3.1875 C 2.4375 -3.1875 2.03125 -3.203125 1.890625 -3.28125 Z M 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 1.078125 0.015625 1.34375 0 1.46875 0 C 1.578125 0 1.859375 0.015625 1.96875 0.03125 L 2 0 C 1.90625 -0.5625 1.890625 -1.25 1.890625 -2 L 1.890625 -2.84375 C 2.109375 -2.78125 2.375 -2.75 2.734375 -2.75 C 4.546875 -2.75 5.171875 -3.890625 5.171875 -4.75 C 5.171875 -5.484375 4.703125 -6.5 2.859375 -6.5 C 2.609375 -6.5 1.765625 -6.4375 1.46875 -6.4375 C 1.34375 -6.4375 1.078125 -6.4375 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 Z M 1.03125 -4.4375 "/>
-</g>
-<g id="glyph-1-6">
-<path d="M 1.890625 -2 L 1.890625 -3.15625 C 2.375 -3.15625 3.421875 -3.125 4.109375 -3.0625 L 4.140625 -3.09375 C 4.125 -3.1875 4.109375 -3.3125 4.109375 -3.40625 C 4.109375 -3.5 4.125 -3.640625 4.140625 -3.734375 L 4.109375 -3.765625 C 3.53125 -3.703125 3.078125 -3.65625 1.890625 -3.65625 L 1.890625 -4.4375 C 1.890625 -4.609375 1.90625 -5.625 1.96875 -5.875 C 3.3125 -5.875 4.6875 -5.75 4.6875 -5.75 L 4.703125 -5.796875 C 4.6875 -5.875 4.6875 -5.96875 4.6875 -6.046875 C 4.6875 -6.125 4.6875 -6.296875 4.703125 -6.4375 L 4.6875 -6.453125 C 4.53125 -6.4375 4.296875 -6.4375 4.09375 -6.4375 L 1.46875 -6.4375 C 1.125 -6.4375 0.953125 -6.453125 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 0.953125 0.03125 1.109375 0 1.46875 0 L 4.1875 0 C 4.390625 0 4.625 0.015625 4.78125 0.03125 L 4.8125 0 C 4.796875 -0.140625 4.78125 -0.203125 4.78125 -0.3125 C 4.78125 -0.40625 4.796875 -0.5625 4.8125 -0.640625 L 4.78125 -0.671875 C 4.78125 -0.671875 3.3125 -0.5625 1.96875 -0.5625 C 1.90625 -0.796875 1.890625 -1.828125 1.890625 -2 Z M 1.890625 -2 "/>
-</g>
-<g id="glyph-1-7">
-<path d="M 2.265625 -4.4375 L 2.265625 -2 C 2.265625 -1.25 2.25 -0.53125 2.15625 0 L 2.1875 0.03125 C 2.296875 0.015625 2.578125 0 2.6875 0 C 2.8125 0 3.078125 0.015625 3.203125 0.03125 L 3.21875 0 C 3.140625 -0.5625 3.125 -1.25 3.125 -2 L 3.125 -4.4375 C 3.125 -4.609375 3.125 -5.625 3.203125 -5.875 C 4.53125 -5.875 5.21875 -5.734375 5.21875 -5.734375 L 5.234375 -5.765625 C 5.21875 -5.921875 5.21875 -6.15625 5.234375 -6.4375 L 5.21875 -6.453125 C 5.046875 -6.4375 4.828125 -6.4375 4.609375 -6.4375 L 0.765625 -6.4375 C 0.5625 -6.4375 0.328125 -6.4375 0.171875 -6.453125 L 0.15625 -6.4375 C 0.171875 -6.15625 0.171875 -5.921875 0.15625 -5.765625 L 0.171875 -5.734375 C 0.171875 -5.734375 0.84375 -5.875 2.1875 -5.875 C 2.25 -5.625 2.265625 -4.609375 2.265625 -4.4375 Z M 2.265625 -4.4375 "/>
-</g>
-<g id="glyph-1-8">
-<path d="M 2.40625 -0.34375 C 1.859375 -0.34375 1.140625 -0.828125 0.828125 -1.421875 L 0.734375 -1.40625 C 0.6875 -1.046875 0.578125 -0.671875 0.5 -0.375 L 0.515625 -0.34375 C 0.515625 -0.34375 1.171875 0.09375 2.328125 0.09375 C 3.5625 0.09375 4.5 -0.640625 4.5 -1.796875 C 4.5 -2.953125 3.515625 -3.484375 2.71875 -3.796875 C 2.21875 -3.984375 1.5 -4.28125 1.5 -5.109375 C 1.5 -5.46875 1.703125 -5.859375 1.96875 -6 C 2.140625 -6.09375 2.34375 -6.125 2.578125 -6.125 C 3.125 -6.125 3.65625 -5.6875 3.921875 -5.078125 L 4.03125 -5.078125 C 4.0625 -5.4375 4.171875 -5.78125 4.25 -6.078125 L 4.234375 -6.109375 C 4.234375 -6.109375 3.8125 -6.5625 2.65625 -6.5625 C 2.375 -6.5625 2.078125 -6.515625 1.796875 -6.390625 C 1.203125 -6.140625 0.703125 -5.5625 0.703125 -4.828125 C 0.703125 -3.78125 1.578125 -3.296875 2.40625 -2.953125 C 3.0625 -2.6875 3.578125 -2.359375 3.578125 -1.5 C 3.578125 -0.75 3 -0.34375 2.40625 -0.34375 Z M 2.40625 -0.34375 "/>
-</g>
-<g id="glyph-1-9">
-<path d="M 2.5 -3.984375 C 2.875 -3.984375 3.21875 -3.6875 3.484375 -3.234375 L 3.625 -3.25 L 3.84375 -4.0625 L 3.828125 -4.09375 C 3.515625 -4.25 3 -4.375 2.5 -4.375 C 1.46875 -4.375 0.375 -3.53125 0.375 -2.171875 C 0.375 -0.78125 1.171875 0.09375 2.375 0.09375 C 2.984375 0.09375 3.453125 -0.09375 3.84375 -0.609375 L 3.65625 -0.8125 L 3.625 -0.8125 C 3.234375 -0.484375 2.921875 -0.40625 2.546875 -0.40625 C 1.828125 -0.40625 1.25 -1.0625 1.25 -2.21875 C 1.25 -3.3125 1.828125 -3.984375 2.5 -3.984375 Z M 2.5 -3.984375 "/>
-</g>
-<g id="glyph-1-10">
-<path d="M 1.078125 -6.4375 C 1.171875 -4.53125 1.140625 -1.578125 0.984375 0 L 1 0.03125 C 1.109375 0.015625 1.21875 0 1.34375 0 C 1.46875 0 1.578125 0.015625 1.6875 0.03125 L 1.71875 0 C 1.640625 -0.5625 1.609375 -1.25 1.609375 -2 L 1.609375 -4.375 C 1.609375 -4.984375 1.625 -5.015625 2 -4.53125 L 5.484375 -0.140625 C 5.59375 0.015625 5.75 0.09375 5.90625 0.09375 C 6.046875 0.09375 6.09375 -0.015625 6.09375 -0.203125 C 6.15625 -2.484375 6.140625 -4.21875 6.296875 -6.4375 L 6.28125 -6.453125 C 6.15625 -6.4375 6.046875 -6.4375 5.9375 -6.4375 C 5.8125 -6.4375 5.703125 -6.4375 5.578125 -6.453125 L 5.5625 -6.4375 C 5.640625 -5.859375 5.65625 -5.1875 5.65625 -4.4375 L 5.65625 -1.78125 C 5.640625 -1.21875 5.5 -1.453125 5.078125 -2.03125 L 1.65625 -6.453125 C 1.65625 -6.453125 1.5625 -6.4375 1.515625 -6.4375 C 1.171875 -6.4375 1.09375 -6.453125 1.09375 -6.453125 Z M 1.078125 -6.4375 "/>
-</g>
-<g id="glyph-1-11">
-<path d="M 0.890625 -2 C 0.890625 -1.25 0.875 -0.53125 0.8125 0 L 0.828125 0.03125 C 0.953125 0.015625 1.171875 0 1.296875 0 C 1.421875 0 1.640625 0.015625 1.765625 0.03125 L 1.78125 0 C 1.703125 -0.5625 1.6875 -1.234375 1.6875 -2 L 1.6875 -2.09375 C 1.78125 -2.078125 2.015625 -2.046875 2.109375 -1.984375 C 2.75 -1.3125 2.984375 -0.953125 3.671875 0.03125 C 3.8125 0.015625 4.15625 0 4.328125 0 C 4.484375 0 4.828125 0.015625 4.921875 0.03125 L 4.9375 0 C 4.203125 -0.78125 3.671875 -1.21875 2.71875 -2.375 C 3.109375 -2.765625 4.09375 -3.71875 4.734375 -4.28125 L 4.71875 -4.3125 C 4.53125 -4.28125 3.984375 -4.28125 3.703125 -4.28125 C 3.265625 -3.65625 2.515625 -2.84375 2.140625 -2.546875 C 2 -2.4375 1.828125 -2.40625 1.6875 -2.390625 L 1.6875 -4.984375 C 1.6875 -5.734375 1.734375 -6.21875 1.78125 -6.859375 C 1.78125 -6.921875 1.75 -6.953125 1.6875 -6.953125 C 1.4375 -6.859375 1.21875 -6.78125 0.828125 -6.765625 L 0.8125 -6.734375 C 0.875 -6.296875 0.890625 -5.5625 0.890625 -4.8125 Z M 0.890625 -2 "/>
-</g>
-<g id="glyph-1-12">
-<path d="M 2.015625 0.09375 C 3.203125 0.09375 4.046875 -0.78125 4.046875 -1.953125 C 4.046875 -2.921875 3.40625 -3.78125 2.28125 -3.78125 C 1.875 -3.78125 1.421875 -3.71875 1.203125 -3.640625 L 1.40625 -5.40625 C 1.78125 -5.359375 2.203125 -5.3125 2.671875 -5.3125 C 2.96875 -5.3125 3.3125 -5.328125 3.71875 -5.375 L 3.875 -6.046875 L 3.8125 -6.078125 C 3.234375 -6.015625 2.703125 -5.984375 2.171875 -5.984375 C 1.796875 -5.984375 1.375 -6.015625 1.03125 -6.046875 L 0.71875 -3.09375 L 0.78125 -3.078125 C 1.171875 -3.234375 1.5625 -3.390625 2.03125 -3.390625 C 2.671875 -3.390625 3.171875 -2.875 3.171875 -1.78125 C 3.171875 -0.875 2.71875 -0.296875 2.03125 -0.296875 C 1.28125 -0.296875 1.15625 -0.703125 0.828125 -1.296875 L 0.6875 -1.28125 L 0.46875 -0.484375 L 0.515625 -0.453125 C 0.75 -0.234375 1.25 0.09375 2.015625 0.09375 Z M 2.015625 0.09375 "/>
-</g>
-<g id="glyph-1-13">
-<path d="M 2.328125 -5.6875 C 2.5 -5.6875 2.65625 -5.625 2.765625 -5.53125 C 3.078125 -5.28125 3.34375 -4.5 3.34375 -3.15625 C 3.34375 -2.25 3.3125 -1.71875 3.171875 -1.21875 C 2.953125 -0.390625 2.46875 -0.296875 2.28125 -0.296875 C 1.359375 -0.296875 1.25 -2 1.25 -2.859375 C 1.25 -5.328125 1.859375 -5.6875 2.328125 -5.6875 Z M 2.265625 0.09375 C 3.046875 0.09375 4.203125 -0.75 4.203125 -3.078125 C 4.203125 -4.640625 3.625 -5.40625 3.265625 -5.734375 C 3 -5.984375 2.6875 -6.078125 2.328125 -6.078125 C 1.328125 -6.078125 0.390625 -4.84375 0.390625 -2.875 C 0.390625 -1.265625 1.03125 0.09375 2.265625 0.09375 Z M 2.265625 0.09375 "/>
-</g>
-<g id="glyph-1-14">
-<path d="M 1.921875 -6.59375 C 1.578125 -4.578125 1.03125 -1.828125 0.640625 0.03125 C 0.75 0 0.84375 0 0.96875 0 C 1.078125 0 1.15625 0 1.265625 0.03125 C 1.421875 -1.015625 1.75 -3.234375 2 -4.75 L 2.046875 -4.75 C 2.796875 -3.15625 3.5 -1.546875 4.140625 0 L 4.328125 0 C 5.046875 -1.640625 5.75 -3.15625 6.546875 -4.71875 L 6.5625 -4.703125 C 6.78125 -3.140625 6.984375 -1.609375 7.125 0.03125 C 7.28125 0 7.46875 0 7.609375 0 C 7.765625 0 8.03125 0 8.171875 0.03125 C 7.8125 -2.109375 7.515625 -4.171875 7.234375 -6.59375 L 6.984375 -6.59375 L 4.5625 -1.65625 L 4.484375 -1.65625 C 3.703125 -3.296875 2.984375 -4.890625 2.25 -6.59375 Z M 1.921875 -6.59375 "/>
-</g>
-<g id="glyph-1-15">
-<path d="M 4.0625 -6.4375 L 1.46875 -6.4375 C 1.125 -6.4375 0.953125 -6.453125 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 0.953125 0.03125 1.109375 0 1.46875 0 C 1.796875 0 1.96875 0.03125 1.96875 0.03125 L 2 0 C 1.90625 -0.5625 1.890625 -1.25 1.890625 -2 L 1.890625 -3.15625 C 2.375 -3.15625 3.421875 -3.125 4.109375 -3.0625 L 4.140625 -3.09375 C 4.125 -3.1875 4.109375 -3.3125 4.109375 -3.40625 C 4.109375 -3.5 4.125 -3.640625 4.140625 -3.734375 L 4.109375 -3.765625 C 3.53125 -3.703125 3.078125 -3.65625 1.890625 -3.65625 L 1.890625 -4.4375 C 1.890625 -4.609375 1.90625 -5.625 1.96875 -5.875 C 3.3125 -5.875 4.6875 -5.75 4.6875 -5.75 L 4.703125 -5.796875 C 4.6875 -5.875 4.6875 -6 4.6875 -6.09375 C 4.6875 -6.171875 4.6875 -6.296875 4.703125 -6.4375 L 4.6875 -6.453125 C 4.53125 -6.4375 4.296875 -6.4375 4.0625 -6.4375 Z M 4.0625 -6.4375 "/>
-</g>
-<g id="glyph-1-16">
-<path d="M 0.796875 -5.953125 C 0.796875 -5.6875 1.03125 -5.4375 1.3125 -5.4375 C 1.578125 -5.4375 1.8125 -5.6875 1.8125 -5.953125 C 1.8125 -6.21875 1.578125 -6.453125 1.3125 -6.453125 C 1.03125 -6.453125 0.796875 -6.21875 0.796875 -5.953125 Z M 0.890625 -2.34375 L 0.890625 -1.796875 C 0.890625 -1.046875 0.875 -0.53125 0.8125 0 L 0.828125 0.03125 C 0.953125 0.015625 1.171875 0 1.296875 0 C 1.421875 0 1.640625 0.015625 1.765625 0.03125 L 1.78125 0 C 1.703125 -0.5625 1.6875 -1.03125 1.6875 -1.796875 L 1.6875 -2.5 C 1.6875 -3.25 1.71875 -3.609375 1.78125 -4.21875 C 1.78125 -4.3125 1.765625 -4.3125 1.6875 -4.3125 C 1.46875 -4.28125 1.03125 -4.28125 0.828125 -4.3125 L 0.8125 -4.28125 C 0.875 -3.84375 0.890625 -3.09375 0.890625 -2.34375 Z M 0.890625 -2.34375 "/>
-</g>
-<g id="glyph-1-17">
-<path d="M 1.609375 -0.8125 L 1.609375 -3.390625 C 2 -3.78125 2.25 -3.984375 2.65625 -3.984375 C 3.265625 -3.984375 3.8125 -3.46875 3.8125 -2.234375 C 3.8125 -1 3.4375 -0.296875 2.5 -0.296875 C 2.1875 -0.296875 1.8125 -0.5625 1.609375 -0.8125 Z M 1.609375 -4.984375 C 1.609375 -5.71875 1.625 -6.234375 1.671875 -6.859375 C 1.671875 -6.921875 1.640625 -6.953125 1.578125 -6.953125 C 1.328125 -6.859375 1.109375 -6.78125 0.71875 -6.765625 L 0.703125 -6.734375 C 0.765625 -6.296875 0.8125 -5.5625 0.8125 -4.8125 L 0.8125 -0.734375 C 0.8125 -0.34375 0.8125 -0.1875 0.765625 0.015625 C 0.828125 0.078125 0.9375 0.09375 1.046875 0.09375 C 1.15625 -0.015625 1.265625 -0.1875 1.375 -0.390625 C 1.671875 -0.140625 2.125 0.09375 2.546875 0.09375 C 3.5625 0.09375 4.6875 -0.625 4.6875 -2.3125 C 4.6875 -3.53125 3.828125 -4.375 2.859375 -4.375 C 2.375 -4.375 1.953125 -4.25 1.609375 -3.875 Z M 1.609375 -4.984375 "/>
-</g>
-<g id="glyph-1-18">
-<path d="M 3.78125 -6.5625 C 2.109375 -6.5625 0.515625 -5.109375 0.515625 -3.140625 C 0.515625 -1.421875 1.5 0.09375 3.65625 0.09375 C 4.5625 0.09375 5.4375 -0.171875 6.09375 -0.984375 C 6.09375 -1.109375 6.078125 -1.328125 6.046875 -1.421875 L 5.96875 -1.453125 C 5.265625 -0.671875 4.671875 -0.40625 3.734375 -0.40625 C 2.46875 -0.40625 1.546875 -1.78125 1.546875 -3.328125 C 1.546875 -5.359375 2.828125 -6.09375 3.65625 -6.09375 C 4.546875 -6.09375 5.28125 -5.734375 5.6875 -4.921875 L 5.796875 -4.921875 C 5.828125 -5.4375 5.875 -5.609375 5.984375 -6.03125 L 5.96875 -6.0625 C 5.96875 -6.0625 5 -6.5625 3.78125 -6.5625 Z M 3.78125 -6.5625 "/>
-</g>
-<g id="glyph-1-19">
-<path d="M 1.890625 -0.8125 L 1.890625 -5.625 C 1.890625 -5.96875 2.25 -6.015625 2.734375 -6.015625 C 4.890625 -6.015625 5.53125 -4.359375 5.53125 -2.828125 C 5.53125 -0.8125 4.390625 -0.421875 2.984375 -0.421875 C 2 -0.421875 1.890625 -0.5 1.890625 -0.8125 Z M 1.46875 -6.4375 C 1.125 -6.4375 0.953125 -6.453125 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 0.953125 0.03125 1.109375 0 1.46875 0 C 2.171875 0 2.265625 0.015625 3.296875 0.015625 C 4.6875 0.015625 6.5625 -0.625 6.5625 -3.078125 C 6.5625 -4.921875 5.03125 -6.453125 3.109375 -6.453125 C 2.46875 -6.453125 2.109375 -6.4375 1.46875 -6.4375 Z M 1.46875 -6.4375 "/>
-</g>
-<g id="glyph-1-20">
-<path d="M 3.640625 -0.921875 L 3.671875 -0.890625 L 3.734375 0 C 3.734375 0.015625 3.734375 0.03125 3.765625 0.03125 C 3.890625 0.015625 4.015625 0 4.15625 0 C 4.28125 0 4.46875 0.015625 4.609375 0.03125 L 4.625 0 C 4.546875 -0.421875 4.453125 -1.171875 4.453125 -1.90625 L 4.453125 -2.484375 C 4.453125 -3.234375 4.46875 -3.75 4.546875 -4.28125 L 4.53125 -4.3125 C 4.40625 -4.28125 4.171875 -4.28125 4.0625 -4.28125 C 3.9375 -4.28125 3.703125 -4.28125 3.59375 -4.3125 L 3.5625 -4.28125 C 3.640625 -3.703125 3.65625 -3.234375 3.65625 -2.484375 L 3.65625 -1.5 C 3.34375 -0.984375 2.78125 -0.453125 2.296875 -0.453125 C 1.9375 -0.453125 1.625 -0.5625 1.625 -1.5 L 1.625 -2.484375 C 1.625 -3.234375 1.640625 -3.75 1.71875 -4.28125 L 1.6875 -4.3125 C 1.578125 -4.28125 1.34375 -4.28125 1.21875 -4.28125 C 1.109375 -4.28125 0.875 -4.28125 0.75 -4.3125 L 0.734375 -4.28125 C 0.8125 -3.703125 0.828125 -3.234375 0.828125 -2.484375 L 0.828125 -1.28125 C 0.828125 -0.59375 1.140625 0.09375 2.140625 0.09375 C 2.75 0.09375 3.3125 -0.40625 3.640625 -0.921875 Z M 3.640625 -0.921875 "/>
-</g>
-<g id="glyph-1-21">
-<path d="M 1.671875 -3.328125 L 1.671875 -4.234375 C 1.671875 -4.3125 1.640625 -4.34375 1.609375 -4.34375 C 1.375 -4.28125 1.015625 -4.28125 0.796875 -4.3125 L 0.78125 -4.28125 C 0.84375 -3.84375 0.875 -3.09375 0.875 -2.34375 L 0.875 -1.796875 C 0.875 -1.046875 0.84375 -0.53125 0.78125 0 L 0.796875 0.03125 C 0.921875 0.015625 1.140625 0 1.265625 0 C 1.390625 0 1.609375 0.015625 1.734375 0.03125 L 1.75 0 C 1.671875 -0.5625 1.671875 -1.03125 1.671875 -1.796875 L 1.671875 -2.3125 C 1.671875 -2.734375 1.78125 -2.96875 2 -3.296875 C 2.140625 -3.515625 2.375 -3.65625 2.5625 -3.65625 C 2.765625 -3.65625 2.953125 -3.625 3.078125 -3.515625 L 3.15625 -3.53125 L 3.359375 -4.265625 L 3.3125 -4.3125 C 3.15625 -4.359375 3.140625 -4.375 2.953125 -4.375 C 2.421875 -4.375 2.140625 -4.0625 1.6875 -3.3125 Z M 1.671875 -3.328125 "/>
-</g>
-<g id="glyph-1-22">
-<path d="M 4.0625 -2.734375 C 3.734375 -2.734375 3.3125 -2.71875 3.015625 -2.71875 C 2.734375 -2.71875 2.1875 -2.734375 1.96875 -2.734375 L 3.03125 -5.1875 L 3.046875 -5.1875 C 3.453125 -4.25 3.78125 -3.4375 4.0625 -2.734375 Z M 1.78125 -2.265625 C 2.03125 -2.296875 2.71875 -2.296875 3.078125 -2.296875 C 3.453125 -2.296875 4 -2.296875 4.234375 -2.265625 C 4.71875 -0.96875 4.9375 -0.1875 5.015625 0.03125 C 5.1875 0 5.34375 0 5.515625 0 C 5.6875 0 5.9375 0 6.109375 0.03125 C 5.609375 -0.984375 4.296875 -4.375 3.375 -6.5625 L 3.09375 -6.5625 C 2.140625 -4.34375 1.171875 -2.171875 0.15625 0.03125 C 0.28125 0 0.40625 0 0.515625 0 C 0.625 0 0.84375 0 0.953125 0.03125 C 1.109375 -0.578125 1.421875 -1.390625 1.78125 -2.265625 Z M 1.78125 -2.265625 "/>
-</g>
-<g id="glyph-1-23">
-<path d="M 3.234375 -1.3125 L 3.203125 -1.3125 C 1.890625 -4.34375 1.390625 -6.109375 1.3125 -6.453125 C 1.140625 -6.4375 0.90625 -6.4375 0.75 -6.4375 C 0.59375 -6.4375 0.3125 -6.4375 0.15625 -6.453125 C 0.65625 -5.453125 1.96875 -2.078125 2.890625 0.09375 L 3.171875 0.09375 C 4.125 -2.109375 5.109375 -4.25 6.109375 -6.453125 C 5.984375 -6.4375 5.796875 -6.4375 5.71875 -6.4375 C 5.609375 -6.4375 5.375 -6.4375 5.265625 -6.453125 C 4.921875 -5.171875 3.9375 -2.96875 3.234375 -1.3125 Z M 3.234375 -1.3125 "/>
-</g>
-<g id="glyph-1-24">
-<path d="M 5.625 0.03125 C 4.8125 -1.109375 4.171875 -2.109375 3.296875 -3.4375 C 4.203125 -4.78125 5.03125 -5.859375 5.46875 -6.453125 C 5.359375 -6.4375 5.15625 -6.4375 5.046875 -6.4375 C 4.9375 -6.4375 4.734375 -6.4375 4.625 -6.453125 C 4.078125 -5.453125 3.84375 -5.09375 3.015625 -3.890625 C 2.359375 -4.890625 1.6875 -5.859375 1.40625 -6.453125 C 1.21875 -6.4375 1 -6.4375 0.84375 -6.4375 C 0.671875 -6.4375 0.453125 -6.4375 0.28125 -6.453125 L 2.4375 -3.203125 L 0.234375 0.03125 C 0.34375 0 0.53125 0 0.640625 0 C 0.765625 0 0.953125 0 1.0625 0.03125 C 1.59375 -0.953125 2.171875 -1.890625 2.734375 -2.734375 C 3.375 -1.78125 3.9375 -0.96875 4.46875 0.03125 C 4.640625 0 4.859375 0 5.046875 0 C 5.21875 0 5.4375 0 5.625 0.03125 Z M 5.625 0.03125 "/>
-</g>
-<g id="glyph-1-25">
-<path d="M 1.90625 -4.4375 C 1.90625 -5.1875 1.921875 -5.90625 2 -6.4375 L 2 -6.453125 C 1.875 -6.4375 1.59375 -6.4375 1.46875 -6.4375 C 1.359375 -6.4375 1.09375 -6.4375 0.96875 -6.453125 L 0.953125 -6.4375 C 1.03125 -5.859375 1.046875 -5.1875 1.046875 -4.4375 L 1.046875 -2 C 1.046875 -1.25 1.03125 -0.53125 0.953125 0 L 0.953125 0.03125 C 0.953125 0.03125 1.140625 0 1.46875 0 L 4.203125 0 C 4.40625 0 4.640625 0.015625 4.796875 0.03125 L 4.8125 0 C 4.8125 -0.140625 4.796875 -0.34375 4.796875 -0.4375 C 4.796875 -0.53125 4.8125 -0.703125 4.8125 -0.78125 L 4.796875 -0.828125 C 4.796875 -0.828125 3.3125 -0.609375 1.984375 -0.609375 C 1.90625 -0.84375 1.90625 -1.828125 1.90625 -2 Z M 1.90625 -4.4375 "/>
-</g>
-<g id="glyph-1-26">
-<path d="M 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 1.078125 0.015625 1.34375 0 1.46875 0 C 1.578125 0 1.859375 0.015625 1.96875 0.03125 L 2 0 C 1.90625 -0.5625 1.890625 -1.25 1.890625 -2 L 1.890625 -4.4375 C 1.890625 -5.1875 1.90625 -5.90625 2 -6.4375 L 1.96875 -6.453125 C 1.859375 -6.4375 1.578125 -6.4375 1.46875 -6.4375 C 1.34375 -6.4375 1.078125 -6.4375 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 Z M 1.03125 -4.4375 "/>
-</g>
-<g id="glyph-1-27">
-<path d="M 1.890625 -5.578125 C 1.890625 -5.90625 2 -6.03125 2.796875 -6.03125 C 3.359375 -6.03125 4.0625 -5.8125 4.0625 -4.796875 C 4.0625 -3.953125 3.5 -3.703125 2.671875 -3.703125 L 1.890625 -3.703125 Z M 1.890625 -3.265625 L 2.828125 -3.265625 C 3.984375 -3.265625 4.53125 -2.546875 4.53125 -1.640625 C 4.53125 -0.953125 4.296875 -0.40625 2.859375 -0.40625 C 2.15625 -0.40625 1.890625 -0.546875 1.890625 -0.875 Z M 1.46875 -6.4375 C 1.125 -6.4375 0.953125 -6.453125 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 0.953125 0.03125 1.109375 0 1.46875 0 C 2.078125 0 2.15625 0.03125 3.046875 0.03125 C 5.03125 0.03125 5.515625 -0.984375 5.515625 -1.828125 C 5.515625 -2.796875 4.875 -3.34375 4 -3.59375 C 4.515625 -3.84375 4.9375 -4.359375 4.9375 -4.921875 C 4.9375 -5.578125 4.578125 -6.453125 2.734375 -6.453125 C 2.375 -6.453125 1.96875 -6.4375 1.46875 -6.4375 Z M 1.46875 -6.4375 "/>
-</g>
-<g id="glyph-1-28">
-<path d="M 5.421875 -4.4375 L 5.421875 -2.765625 C 5.421875 -1.640625 5.25 -0.453125 3.5625 -0.453125 C 1.890625 -0.453125 1.890625 -2.140625 1.890625 -2.6875 L 1.890625 -4.4375 C 1.890625 -5.1875 1.90625 -5.90625 2 -6.4375 L 1.96875 -6.453125 C 1.859375 -6.4375 1.578125 -6.4375 1.46875 -6.4375 C 1.34375 -6.4375 1.078125 -6.4375 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2.359375 C 1.03125 -0.3125 2.46875 0.09375 3.359375 0.09375 C 5.40625 0.09375 5.96875 -1.171875 5.96875 -2.9375 L 5.96875 -4.4375 C 5.96875 -5.1875 5.984375 -5.90625 6.0625 -6.4375 L 6.046875 -6.453125 C 5.921875 -6.4375 5.8125 -6.4375 5.6875 -6.4375 C 5.578125 -6.4375 5.46875 -6.4375 5.34375 -6.453125 L 5.328125 -6.4375 C 5.40625 -5.859375 5.421875 -5.1875 5.421875 -4.4375 Z M 5.421875 -4.4375 "/>
-</g>
-<g id="glyph-1-29">
-<path d="M 5.34375 -4.4375 L 5.34375 -3.640625 L 1.890625 -3.640625 L 1.890625 -4.4375 C 1.890625 -5.1875 1.90625 -5.90625 2 -6.4375 L 1.984375 -6.453125 C 1.859375 -6.4375 1.578125 -6.4375 1.46875 -6.4375 C 1.34375 -6.4375 1.078125 -6.4375 0.953125 -6.453125 L 0.9375 -6.4375 C 1.015625 -5.859375 1.03125 -5.1875 1.03125 -4.4375 L 1.03125 -2 C 1.03125 -1.25 1.015625 -0.53125 0.9375 0 L 0.953125 0.03125 C 1.0625 0.015625 1.34375 0 1.46875 0 C 1.578125 0 1.859375 0.015625 1.96875 0.03125 L 2 0 C 1.90625 -0.5625 1.890625 -1.25 1.890625 -2 L 1.890625 -3.203125 L 5.34375 -3.203125 L 5.34375 -2 C 5.34375 -1.25 5.328125 -0.53125 5.25 0 L 5.25 0.03125 C 5.375 0.015625 5.65625 0 5.765625 0 C 5.890625 0 6.15625 0.015625 6.28125 0.03125 L 6.296875 0 C 6.21875 -0.5625 6.203125 -1.25 6.203125 -2 L 6.203125 -4.4375 C 6.203125 -5.1875 6.21875 -5.90625 6.296875 -6.4375 L 6.296875 -6.453125 C 6.171875 -6.4375 5.890625 -6.4375 5.765625 -6.4375 C 5.65625 -6.4375 5.390625 -6.4375 5.265625 -6.453125 L 5.25 -6.4375 C 5.328125 -5.859375 5.34375 -5.1875 5.34375 -4.4375 Z M 5.34375 -4.4375 "/>
-</g>
-<g id="glyph-1-30">
-<path d="M 3.3125 -2.84375 C 3.734375 -3.640625 4.421875 -4.609375 5.625 -6.453125 C 5.5 -6.4375 5.328125 -6.4375 5.203125 -6.4375 C 5.078125 -6.4375 4.90625 -6.4375 4.78125 -6.453125 C 4.28125 -5.40625 3.625 -4.296875 3.046875 -3.390625 C 2.40625 -4.46875 1.828125 -5.40625 1.296875 -6.453125 C 1.125 -6.4375 0.890625 -6.4375 0.734375 -6.4375 C 0.5625 -6.4375 0.328125 -6.4375 0.15625 -6.453125 C 0.5625 -5.859375 2.078125 -3.390625 2.46875 -2.734375 C 2.46875 -1.765625 2.4375 -0.515625 2.390625 0.03125 C 2.515625 0.015625 2.765625 0 2.890625 0 C 3.015625 0 3.265625 0.015625 3.390625 0.03125 C 3.34375 -0.453125 3.328125 -1.859375 3.3125 -2.84375 Z M 3.3125 -2.84375 "/>
-</g>
-<g id="glyph-1-31">
-<path d="M 5.4375 -1.171875 L 5.4375 -0.71875 C 5.109375 -0.40625 4.515625 -0.34375 3.984375 -0.34375 C 2.203125 -0.34375 1.546875 -1.9375 1.546875 -3.265625 C 1.546875 -5 2.515625 -6.125 3.859375 -6.125 C 4.75 -6.125 5.515625 -5.625 5.96875 -4.859375 L 6.09375 -4.875 C 6.125 -5.390625 6.1875 -5.65625 6.296875 -6.03125 L 6.265625 -6.0625 C 6.265625 -6.0625 5.21875 -6.5625 4 -6.5625 C 2.28125 -6.5625 0.515625 -5.359375 0.515625 -3.15625 C 0.515625 -1.4375 1.75 0.09375 3.75 0.09375 C 4.90625 0.09375 5.71875 -0.234375 6.40625 -0.796875 L 6.40625 -0.828125 C 6.3125 -0.921875 6.296875 -1.203125 6.296875 -1.3125 L 6.296875 -1.359375 C 6.296875 -2.109375 6.3125 -2.53125 6.40625 -3.0625 L 6.390625 -3.09375 C 6.390625 -3.09375 6.21875 -3.0625 5.875 -3.0625 C 5.53125 -3.0625 5.359375 -3.09375 5.359375 -3.09375 L 5.34375 -3.0625 C 5.421875 -2.5 5.4375 -1.90625 5.4375 -1.171875 Z M 5.4375 -1.171875 "/>
-</g>
-<g id="glyph-1-32">
-<path d="M 4.578125 -1.796875 C 4.578125 -2.109375 4.609375 -2.484375 4.609375 -2.796875 C 4.609375 -3.84375 4.265625 -4.375 3.375 -4.375 C 2.984375 -4.375 2.28125 -4.234375 1.640625 -3.484375 L 1.625 -3.515625 L 1.625 -4.234375 C 1.609375 -4.3125 1.609375 -4.34375 1.5625 -4.34375 C 1.328125 -4.28125 0.984375 -4.28125 0.75 -4.3125 L 0.734375 -4.28125 C 0.8125 -3.84375 0.828125 -3.09375 0.828125 -2.34375 L 0.828125 -1.796875 C 0.828125 -1.046875 0.8125 -0.53125 0.734375 0 L 0.75 0.03125 C 0.875 0.015625 1.109375 0 1.21875 0 C 1.34375 0 1.578125 0.015625 1.6875 0.03125 L 1.71875 0 C 1.640625 -0.5625 1.625 -1.03125 1.625 -1.796875 L 1.625 -2.953125 C 2.140625 -3.5625 2.71875 -3.78125 3.078125 -3.78125 C 3.578125 -3.78125 3.78125 -3.578125 3.78125 -2.765625 L 3.78125 -1.796875 C 3.78125 -1.046875 3.765625 -0.53125 3.703125 0 L 3.71875 0.03125 C 3.84375 0.015625 4.0625 0 4.1875 0 C 4.3125 0 4.53125 0.015625 4.65625 0.03125 L 4.671875 0 C 4.59375 -0.5625 4.578125 -1.03125 4.578125 -1.796875 Z M 4.578125 -1.796875 "/>
-</g>
-</g>
-<clipPath id="clip-0">
-<path clip-rule="nonzero" d="M 8.953125 193.160156 L 54.308594 193.160156 L 54.308594 215.839844 L 8.953125 215.839844 Z M 8.953125 193.160156 "/>
-</clipPath>
-<linearGradient id="linear-pattern-0" gradientUnits="userSpaceOnUse" x1="0" y1="25.002712" x2="0" y2="74.997244" gradientTransform="matrix(0.90721, 0, 0, -0.4536, -13.72998, 227.18055)">
-<stop offset="0" stop-color="rgb(91.372681%, 71.765137%, 71.765137%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(91.423035%, 71.929932%, 71.929932%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(91.523743%, 72.261047%, 72.261047%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(91.624451%, 72.592163%, 72.592163%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(91.726685%, 72.923279%, 72.923279%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(91.827393%, 73.252869%, 73.252869%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(91.928101%, 73.583984%, 73.583984%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(92.028809%, 73.9151%, 73.9151%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(92.131042%, 74.246216%, 74.246216%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(92.23175%, 74.577332%, 74.577332%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(92.332458%, 74.906921%, 74.906921%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(92.433167%, 75.238037%, 75.238037%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(92.5354%, 75.569153%, 75.569153%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(92.636108%, 75.900269%, 75.900269%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(92.736816%, 76.231384%, 76.231384%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(92.837524%, 76.560974%, 76.560974%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(92.939758%, 76.89209%, 76.89209%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(93.040466%, 77.223206%, 77.223206%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(93.141174%, 77.554321%, 77.554321%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(93.241882%, 77.885437%, 77.885437%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(93.344116%, 78.216553%, 78.216553%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(93.444824%, 78.546143%, 78.546143%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(93.545532%, 78.877258%, 78.877258%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(93.64624%, 79.208374%, 79.208374%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(93.748474%, 79.53949%, 79.53949%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(93.849182%, 79.870605%, 79.870605%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(93.94989%, 80.200195%, 80.200195%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(94.050598%, 80.531311%, 80.531311%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(94.152832%, 80.862427%, 80.862427%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(94.25354%, 81.193542%, 81.193542%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(94.354248%, 81.524658%, 81.524658%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(94.454956%, 81.855774%, 81.855774%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(94.555664%, 82.185364%, 82.185364%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(94.657898%, 82.516479%, 82.516479%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(94.758606%, 82.847595%, 82.847595%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(94.859314%, 83.178711%, 83.178711%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(94.961548%, 83.509827%, 83.509827%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(95.062256%, 83.839417%, 83.839417%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(95.162964%, 84.170532%, 84.170532%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(95.263672%, 84.501648%, 84.501648%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(95.365906%, 84.832764%, 84.832764%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(95.466614%, 85.163879%, 85.163879%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(95.567322%, 85.494995%, 85.494995%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(95.66803%, 85.824585%, 85.824585%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(95.770264%, 86.155701%, 86.155701%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(95.870972%, 86.486816%, 86.486816%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(95.97168%, 86.817932%, 86.817932%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(96.072388%, 87.149048%, 87.149048%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(96.174622%, 87.478638%, 87.478638%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(96.27533%, 87.809753%, 87.809753%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(96.376038%, 88.140869%, 88.140869%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(96.478271%, 88.471985%, 88.471985%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(96.578979%, 88.803101%, 88.803101%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(96.679688%, 89.13269%, 89.13269%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(96.780396%, 89.463806%, 89.463806%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(96.882629%, 89.794922%, 89.794922%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(96.983337%, 90.126038%, 90.126038%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(97.084045%, 90.457153%, 90.457153%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(97.184753%, 90.788269%, 90.788269%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(97.286987%, 91.117859%, 91.117859%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(97.387695%, 91.448975%, 91.448975%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(97.488403%, 91.78009%, 91.78009%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(97.589111%, 92.111206%, 92.111206%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(97.691345%, 92.442322%, 92.442322%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(97.792053%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-1">
-<path clip-rule="nonzero" d="M 8.953125 159.144531 L 54.308594 159.144531 L 54.308594 181.824219 L 8.953125 181.824219 Z M 8.953125 159.144531 "/>
-</clipPath>
-<linearGradient id="linear-pattern-1" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997244" gradientTransform="matrix(0.90721, 0, 0, -0.4536, -13.72998, 193.16448)">
-<stop offset="0" stop-color="rgb(91.372681%, 71.765137%, 71.765137%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(91.423035%, 71.929932%, 71.929932%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(91.523743%, 72.261047%, 72.261047%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(91.624451%, 72.592163%, 72.592163%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(91.726685%, 72.923279%, 72.923279%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(91.827393%, 73.252869%, 73.252869%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(91.928101%, 73.583984%, 73.583984%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(92.028809%, 73.9151%, 73.9151%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(92.131042%, 74.246216%, 74.246216%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(92.23175%, 74.577332%, 74.577332%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(92.332458%, 74.906921%, 74.906921%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(92.433167%, 75.238037%, 75.238037%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(92.5354%, 75.569153%, 75.569153%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(92.636108%, 75.900269%, 75.900269%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(92.736816%, 76.231384%, 76.231384%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(92.837524%, 76.560974%, 76.560974%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(92.939758%, 76.89209%, 76.89209%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(93.040466%, 77.223206%, 77.223206%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(93.141174%, 77.554321%, 77.554321%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(93.241882%, 77.885437%, 77.885437%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(93.344116%, 78.216553%, 78.216553%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(93.444824%, 78.546143%, 78.546143%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(93.545532%, 78.877258%, 78.877258%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(93.64624%, 79.208374%, 79.208374%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(93.748474%, 79.53949%, 79.53949%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(93.849182%, 79.870605%, 79.870605%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(93.94989%, 80.200195%, 80.200195%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(94.050598%, 80.531311%, 80.531311%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(94.152832%, 80.862427%, 80.862427%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(94.25354%, 81.193542%, 81.193542%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(94.354248%, 81.524658%, 81.524658%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(94.454956%, 81.855774%, 81.855774%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(94.555664%, 82.185364%, 82.185364%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(94.657898%, 82.516479%, 82.516479%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(94.758606%, 82.847595%, 82.847595%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(94.859314%, 83.178711%, 83.178711%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(94.961548%, 83.509827%, 83.509827%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(95.062256%, 83.839417%, 83.839417%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(95.162964%, 84.170532%, 84.170532%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(95.263672%, 84.501648%, 84.501648%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(95.365906%, 84.832764%, 84.832764%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(95.466614%, 85.163879%, 85.163879%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(95.567322%, 85.494995%, 85.494995%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(95.66803%, 85.824585%, 85.824585%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(95.770264%, 86.155701%, 86.155701%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(95.870972%, 86.486816%, 86.486816%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(95.97168%, 86.817932%, 86.817932%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(96.072388%, 87.149048%, 87.149048%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(96.174622%, 87.478638%, 87.478638%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(96.27533%, 87.809753%, 87.809753%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(96.376038%, 88.140869%, 88.140869%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(96.478271%, 88.471985%, 88.471985%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(96.578979%, 88.803101%, 88.803101%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(96.679688%, 89.13269%, 89.13269%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(96.780396%, 89.463806%, 89.463806%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(96.882629%, 89.794922%, 89.794922%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(96.983337%, 90.126038%, 90.126038%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(97.084045%, 90.457153%, 90.457153%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(97.184753%, 90.788269%, 90.788269%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(97.286987%, 91.117859%, 91.117859%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(97.387695%, 91.448975%, 91.448975%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(97.488403%, 91.78009%, 91.78009%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(97.589111%, 92.111206%, 92.111206%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(97.691345%, 92.442322%, 92.442322%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(97.792053%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-2">
-<path clip-rule="nonzero" d="M 8.953125 125.128906 L 54.308594 125.128906 L 54.308594 147.808594 L 8.953125 147.808594 Z M 8.953125 125.128906 "/>
-</clipPath>
-<linearGradient id="linear-pattern-2" gradientUnits="userSpaceOnUse" x1="0" y1="25.002778" x2="0" y2="74.99731" gradientTransform="matrix(0.90721, 0, 0, -0.4536, -13.72998, 159.14843)">
-<stop offset="0" stop-color="rgb(91.372681%, 71.765137%, 71.765137%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(91.423035%, 71.929932%, 71.929932%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(91.523743%, 72.261047%, 72.261047%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(91.624451%, 72.592163%, 72.592163%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(91.726685%, 72.923279%, 72.923279%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(91.827393%, 73.252869%, 73.252869%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(91.928101%, 73.583984%, 73.583984%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(92.028809%, 73.9151%, 73.9151%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(92.131042%, 74.246216%, 74.246216%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(92.23175%, 74.577332%, 74.577332%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(92.332458%, 74.906921%, 74.906921%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(92.433167%, 75.238037%, 75.238037%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(92.5354%, 75.569153%, 75.569153%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(92.636108%, 75.900269%, 75.900269%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(92.736816%, 76.231384%, 76.231384%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(92.837524%, 76.5625%, 76.5625%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(92.939758%, 76.89209%, 76.89209%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(93.040466%, 77.223206%, 77.223206%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(93.141174%, 77.554321%, 77.554321%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(93.241882%, 77.885437%, 77.885437%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(93.344116%, 78.216553%, 78.216553%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(93.444824%, 78.546143%, 78.546143%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(93.545532%, 78.877258%, 78.877258%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(93.64624%, 79.208374%, 79.208374%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(93.748474%, 79.53949%, 79.53949%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(93.849182%, 79.870605%, 79.870605%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(93.94989%, 80.200195%, 80.200195%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(94.050598%, 80.531311%, 80.531311%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(94.152832%, 80.862427%, 80.862427%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(94.25354%, 81.193542%, 81.193542%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(94.354248%, 81.524658%, 81.524658%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(94.454956%, 81.855774%, 81.855774%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(94.55719%, 82.185364%, 82.185364%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(94.657898%, 82.516479%, 82.516479%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(94.758606%, 82.847595%, 82.847595%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(94.859314%, 83.178711%, 83.178711%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(94.961548%, 83.509827%, 83.509827%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(95.062256%, 83.839417%, 83.839417%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(95.162964%, 84.170532%, 84.170532%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(95.263672%, 84.501648%, 84.501648%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(95.365906%, 84.832764%, 84.832764%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(95.466614%, 85.163879%, 85.163879%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(95.567322%, 85.494995%, 85.494995%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(95.66803%, 85.824585%, 85.824585%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(95.770264%, 86.155701%, 86.155701%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(95.870972%, 86.486816%, 86.486816%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(95.97168%, 86.817932%, 86.817932%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(96.072388%, 87.149048%, 87.149048%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(96.174622%, 87.478638%, 87.478638%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(96.27533%, 87.809753%, 87.809753%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(96.376038%, 88.140869%, 88.140869%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(96.478271%, 88.471985%, 88.471985%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(96.578979%, 88.803101%, 88.803101%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(96.679688%, 89.134216%, 89.134216%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(96.780396%, 89.463806%, 89.463806%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(96.882629%, 89.794922%, 89.794922%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(96.983337%, 90.126038%, 90.126038%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(97.084045%, 90.457153%, 90.457153%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(97.184753%, 90.788269%, 90.788269%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(97.286987%, 91.117859%, 91.117859%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(97.387695%, 91.448975%, 91.448975%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(97.488403%, 91.78009%, 91.78009%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(97.589111%, 92.111206%, 92.111206%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(97.691345%, 92.442322%, 92.442322%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(97.792053%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-3">
-<path clip-rule="nonzero" d="M 8.953125 91.113281 L 54.308594 91.113281 L 54.308594 113.792969 L 8.953125 113.792969 Z M 8.953125 91.113281 "/>
-</clipPath>
-<linearGradient id="linear-pattern-3" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(0.90721, 0, 0, -0.4536, -13.72998, 125.13233)">
-<stop offset="0" stop-color="rgb(91.372681%, 71.765137%, 71.765137%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(91.423035%, 71.929932%, 71.929932%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(91.523743%, 72.261047%, 72.261047%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(91.624451%, 72.592163%, 72.592163%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(91.726685%, 72.923279%, 72.923279%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(91.827393%, 73.252869%, 73.252869%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(91.928101%, 73.583984%, 73.583984%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(92.028809%, 73.9151%, 73.9151%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(92.131042%, 74.246216%, 74.246216%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(92.23175%, 74.577332%, 74.577332%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(92.332458%, 74.906921%, 74.906921%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(92.433167%, 75.238037%, 75.238037%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(92.5354%, 75.569153%, 75.569153%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(92.636108%, 75.900269%, 75.900269%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(92.736816%, 76.231384%, 76.231384%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(92.837524%, 76.5625%, 76.5625%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(92.939758%, 76.89209%, 76.89209%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(93.040466%, 77.223206%, 77.223206%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(93.141174%, 77.554321%, 77.554321%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(93.241882%, 77.885437%, 77.885437%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(93.344116%, 78.216553%, 78.216553%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(93.444824%, 78.546143%, 78.546143%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(93.545532%, 78.877258%, 78.877258%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(93.64624%, 79.208374%, 79.208374%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(93.748474%, 79.53949%, 79.53949%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(93.849182%, 79.870605%, 79.870605%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(93.94989%, 80.200195%, 80.200195%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(94.050598%, 80.531311%, 80.531311%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(94.152832%, 80.862427%, 80.862427%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(94.25354%, 81.193542%, 81.193542%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(94.354248%, 81.524658%, 81.524658%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(94.454956%, 81.855774%, 81.855774%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(94.555664%, 82.185364%, 82.185364%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(94.657898%, 82.516479%, 82.516479%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(94.758606%, 82.847595%, 82.847595%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(94.859314%, 83.178711%, 83.178711%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(94.961548%, 83.509827%, 83.509827%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(95.062256%, 83.839417%, 83.839417%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(95.162964%, 84.170532%, 84.170532%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(95.263672%, 84.501648%, 84.501648%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(95.365906%, 84.832764%, 84.832764%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(95.466614%, 85.163879%, 85.163879%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(95.567322%, 85.494995%, 85.494995%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(95.66803%, 85.824585%, 85.824585%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(95.770264%, 86.155701%, 86.155701%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(95.870972%, 86.486816%, 86.486816%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(95.97168%, 86.817932%, 86.817932%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(96.072388%, 87.149048%, 87.149048%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(96.174622%, 87.478638%, 87.478638%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(96.27533%, 87.809753%, 87.809753%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(96.376038%, 88.140869%, 88.140869%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(96.478271%, 88.471985%, 88.471985%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(96.578979%, 88.803101%, 88.803101%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(96.679688%, 89.13269%, 89.13269%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(96.780396%, 89.463806%, 89.463806%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(96.882629%, 89.794922%, 89.794922%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(96.983337%, 90.126038%, 90.126038%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(97.084045%, 90.457153%, 90.457153%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(97.184753%, 90.788269%, 90.788269%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(97.286987%, 91.117859%, 91.117859%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(97.387695%, 91.448975%, 91.448975%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(97.488403%, 91.78009%, 91.78009%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(97.589111%, 92.111206%, 92.111206%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(97.691345%, 92.442322%, 92.442322%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(97.792053%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-4">
-<path clip-rule="nonzero" d="M 99.664062 125.128906 L 145.019531 125.128906 L 145.019531 147.808594 L 99.664062 147.808594 Z M 99.664062 125.128906 "/>
-</clipPath>
-<linearGradient id="linear-pattern-4" gradientUnits="userSpaceOnUse" x1="0" y1="25.002778" x2="0" y2="74.99731" gradientTransform="matrix(0.90721, 0, 0, -0.4536, 76.97972, 159.14843)">
-<stop offset="0" stop-color="rgb(75.686646%, 83.529663%, 96.076965%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(75.828552%, 83.625793%, 96.099854%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(76.113892%, 83.81958%, 96.14563%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(76.399231%, 84.011841%, 96.192932%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(76.683044%, 84.205627%, 96.238708%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(76.968384%, 84.397888%, 96.284485%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(77.253723%, 84.590149%, 96.330261%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(77.537537%, 84.783936%, 96.376038%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(77.822876%, 84.976196%, 96.421814%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(78.108215%, 85.169983%, 96.46759%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(78.393555%, 85.362244%, 96.514893%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(78.677368%, 85.55603%, 96.560669%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(78.962708%, 85.748291%, 96.606445%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(79.248047%, 85.942078%, 96.652222%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(79.53186%, 86.134338%, 96.697998%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(79.8172%, 86.328125%, 96.743774%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(80.102539%, 86.520386%, 96.789551%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(80.386353%, 86.714172%, 96.836853%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(80.671692%, 86.906433%, 96.882629%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(80.957031%, 87.10022%, 96.928406%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(81.240845%, 87.29248%, 96.974182%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(81.526184%, 87.484741%, 97.019958%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(81.811523%, 87.678528%, 97.065735%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(82.096863%, 87.870789%, 97.111511%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(82.380676%, 88.064575%, 97.158813%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(82.666016%, 88.256836%, 97.20459%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(82.951355%, 88.450623%, 97.250366%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(83.235168%, 88.642883%, 97.296143%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(83.520508%, 88.83667%, 97.341919%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(83.805847%, 89.028931%, 97.387695%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(84.089661%, 89.222717%, 97.433472%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(84.375%, 89.414978%, 97.480774%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(84.660339%, 89.608765%, 97.52655%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(84.945679%, 89.801025%, 97.572327%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(85.229492%, 89.994812%, 97.618103%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(85.514832%, 90.187073%, 97.663879%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(85.800171%, 90.380859%, 97.709656%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(86.083984%, 90.57312%, 97.755432%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(86.369324%, 90.765381%, 97.801208%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(86.654663%, 90.959167%, 97.846985%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(86.940002%, 91.151428%, 97.892761%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(87.223816%, 91.345215%, 97.938538%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(87.509155%, 91.537476%, 97.98584%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(87.794495%, 91.731262%, 98.031616%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(88.078308%, 91.923523%, 98.077393%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(88.363647%, 92.11731%, 98.123169%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(88.648987%, 92.30957%, 98.168945%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(88.934326%, 92.503357%, 98.214722%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(89.21814%, 92.695618%, 98.260498%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(89.503479%, 92.889404%, 98.306274%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(89.788818%, 93.081665%, 98.352051%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(90.072632%, 93.275452%, 98.397827%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(90.357971%, 93.467712%, 98.443604%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(90.643311%, 93.661499%, 98.48938%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(90.927124%, 93.85376%, 98.536682%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(91.212463%, 94.046021%, 98.582458%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(91.497803%, 94.239807%, 98.628235%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(91.783142%, 94.432068%, 98.674011%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(92.066956%, 94.625854%, 98.719788%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(92.352295%, 94.818115%, 98.765564%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(92.637634%, 95.011902%, 98.81134%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(92.921448%, 95.204163%, 98.857117%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(93.206787%, 95.397949%, 98.902893%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(93.492126%, 95.59021%, 98.948669%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(93.777466%, 95.783997%, 98.994446%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-5">
-<path clip-rule="nonzero" d="M 190.371094 23.082031 L 247.066406 23.082031 L 247.066406 45.757812 L 190.371094 45.757812 Z M 190.371094 23.082031 "/>
-</clipPath>
-<linearGradient id="linear-pattern-5" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 57.09974)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-6">
-<path clip-rule="nonzero" d="M 190.371094 57.097656 L 247.066406 57.097656 L 247.066406 79.773438 L 190.371094 79.773438 Z M 190.371094 57.097656 "/>
-</clipPath>
-<linearGradient id="linear-pattern-6" gradientUnits="userSpaceOnUse" x1="0" y1="25.002756" x2="0" y2="74.997288" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 91.11583)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-7">
-<path clip-rule="nonzero" d="M 190.371094 91.113281 L 247.066406 91.113281 L 247.066406 113.792969 L 190.371094 113.792969 Z M 190.371094 91.113281 "/>
-</clipPath>
-<linearGradient id="linear-pattern-7" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 125.13233)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-8">
-<path clip-rule="nonzero" d="M 190.371094 125.128906 L 247.066406 125.128906 L 247.066406 147.808594 L 190.371094 147.808594 Z M 190.371094 125.128906 "/>
-</clipPath>
-<linearGradient id="linear-pattern-8" gradientUnits="userSpaceOnUse" x1="0" y1="25.002778" x2="0" y2="74.99731" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 159.14843)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.398438%, 83.398438%, 83.398438%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-9">
-<path clip-rule="nonzero" d="M 190.371094 159.144531 L 247.066406 159.144531 L 247.066406 181.824219 L 190.371094 181.824219 Z M 190.371094 159.144531 "/>
-</clipPath>
-<linearGradient id="linear-pattern-9" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997244" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 193.16448)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-10">
-<path clip-rule="nonzero" d="M 190.371094 193.160156 L 247.066406 193.160156 L 247.066406 215.839844 L 190.371094 215.839844 Z M 190.371094 193.160156 "/>
-</clipPath>
-<linearGradient id="linear-pattern-10" gradientUnits="userSpaceOnUse" x1="0" y1="25.002712" x2="0" y2="74.997244" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 227.18055)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-11">
-<path clip-rule="nonzero" d="M 190.371094 227.179688 L 247.066406 227.179688 L 247.066406 249.855469 L 190.371094 249.855469 Z M 190.371094 227.179688 "/>
-</clipPath>
-<linearGradient id="linear-pattern-11" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.134, 0, 0, -0.4536, 162.0192, 261.19664)">
-<stop offset="0" stop-color="rgb(80.000305%, 80.000305%, 80.000305%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(80.116272%, 80.116272%, 80.116272%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(80.351257%, 80.351257%, 80.351257%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(80.586243%, 80.586243%, 80.586243%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(80.819702%, 80.819702%, 80.819702%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(81.054688%, 81.054688%, 81.054688%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(81.288147%, 81.288147%, 81.288147%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(81.523132%, 81.523132%, 81.523132%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(81.756592%, 81.756592%, 81.756592%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(81.991577%, 81.991577%, 81.991577%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(82.226562%, 82.226562%, 82.226562%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(82.460022%, 82.460022%, 82.460022%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(82.695007%, 82.695007%, 82.695007%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(82.928467%, 82.928467%, 82.928467%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(83.163452%, 83.163452%, 83.163452%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(83.396912%, 83.396912%, 83.396912%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(83.631897%, 83.631897%, 83.631897%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(83.866882%, 83.866882%, 83.866882%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(84.100342%, 84.100342%, 84.100342%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(84.335327%, 84.335327%, 84.335327%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(84.568787%, 84.568787%, 84.568787%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(84.803772%, 84.803772%, 84.803772%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(85.038757%, 85.038757%, 85.038757%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(85.272217%, 85.272217%, 85.272217%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(85.507202%, 85.507202%, 85.507202%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(85.740662%, 85.740662%, 85.740662%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(85.975647%, 85.975647%, 85.975647%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(86.209106%, 86.209106%, 86.209106%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(86.444092%, 86.444092%, 86.444092%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(86.679077%, 86.679077%, 86.679077%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(86.912537%, 86.912537%, 86.912537%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(87.147522%, 87.147522%, 87.147522%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(87.380981%, 87.380981%, 87.380981%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(87.615967%, 87.615967%, 87.615967%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(87.850952%, 87.850952%, 87.850952%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(88.084412%, 88.084412%, 88.084412%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(88.319397%, 88.319397%, 88.319397%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(88.552856%, 88.552856%, 88.552856%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(88.787842%, 88.787842%, 88.787842%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(89.021301%, 89.021301%, 89.021301%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(89.256287%, 89.256287%, 89.256287%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(89.491272%, 89.491272%, 89.491272%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(89.724731%, 89.724731%, 89.724731%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(89.959717%, 89.959717%, 89.959717%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(90.193176%, 90.193176%, 90.193176%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(90.428162%, 90.428162%, 90.428162%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(90.663147%, 90.663147%, 90.663147%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(90.896606%, 90.896606%, 90.896606%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(91.131592%, 91.131592%, 91.131592%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(91.365051%, 91.365051%, 91.365051%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(91.600037%, 91.600037%, 91.600037%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(91.833496%, 91.833496%, 91.833496%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(92.068481%, 92.068481%, 92.068481%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(92.303467%, 92.303467%, 92.303467%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(92.536926%, 92.536926%, 92.536926%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(92.771912%, 92.771912%, 92.771912%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(93.005371%, 93.005371%, 93.005371%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(93.240356%, 93.240356%, 93.240356%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(93.473816%, 93.473816%, 93.473816%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(93.708801%, 93.708801%, 93.708801%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(93.943787%, 93.943787%, 93.943787%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(94.177246%, 94.177246%, 94.177246%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(94.412231%, 94.412231%, 94.412231%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(94.645691%, 94.645691%, 94.645691%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(94.880676%, 94.880676%, 94.880676%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-12">
-<path clip-rule="nonzero" d="M 190 226 L 248 226 L 248 250.054688 L 190 250.054688 Z M 190 226 "/>
-</clipPath>
-<clipPath id="clip-13">
-<path clip-rule="nonzero" d="M 281.082031 46 L 349.113281 46 L 349.113281 68 L 281.082031 68 Z M 281.082031 46 "/>
-</clipPath>
-<linearGradient id="linear-pattern-12" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.36081, 0, 0, -0.4536, 247.0581, 79.77726)">
-<stop offset="0" stop-color="rgb(57.649231%, 87.059021%, 57.649231%)" stop-opacity="1"/>
-<stop offset="0.0078125" stop-color="rgb(57.759094%, 87.09259%, 57.759094%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(57.978821%, 87.159729%, 57.978821%)" stop-opacity="1"/>
-<stop offset="0.0234375" stop-color="rgb(58.200073%, 87.226868%, 58.200073%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(58.4198%, 87.294006%, 58.4198%)" stop-opacity="1"/>
-<stop offset="0.0390625" stop-color="rgb(58.641052%, 87.361145%, 58.641052%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(58.860779%, 87.42981%, 58.860779%)" stop-opacity="1"/>
-<stop offset="0.0546875" stop-color="rgb(59.082031%, 87.496948%, 59.082031%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(59.301758%, 87.564087%, 59.301758%)" stop-opacity="1"/>
-<stop offset="0.0703125" stop-color="rgb(59.52301%, 87.631226%, 59.52301%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(59.742737%, 87.698364%, 59.742737%)" stop-opacity="1"/>
-<stop offset="0.0859375" stop-color="rgb(59.963989%, 87.765503%, 59.963989%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(60.185242%, 87.834167%, 60.185242%)" stop-opacity="1"/>
-<stop offset="0.101562" stop-color="rgb(60.404968%, 87.901306%, 60.404968%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(60.626221%, 87.968445%, 60.626221%)" stop-opacity="1"/>
-<stop offset="0.117188" stop-color="rgb(60.845947%, 88.035583%, 60.845947%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(61.0672%, 88.102722%, 61.0672%)" stop-opacity="1"/>
-<stop offset="0.132813" stop-color="rgb(61.286926%, 88.169861%, 61.286926%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(61.508179%, 88.238525%, 61.508179%)" stop-opacity="1"/>
-<stop offset="0.148438" stop-color="rgb(61.727905%, 88.305664%, 61.727905%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(61.949158%, 88.372803%, 61.949158%)" stop-opacity="1"/>
-<stop offset="0.164063" stop-color="rgb(62.168884%, 88.439941%, 62.168884%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(62.390137%, 88.50708%, 62.390137%)" stop-opacity="1"/>
-<stop offset="0.179688" stop-color="rgb(62.609863%, 88.574219%, 62.609863%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(62.831116%, 88.642883%, 62.831116%)" stop-opacity="1"/>
-<stop offset="0.195312" stop-color="rgb(63.052368%, 88.710022%, 63.052368%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(63.272095%, 88.777161%, 63.272095%)" stop-opacity="1"/>
-<stop offset="0.210938" stop-color="rgb(63.493347%, 88.844299%, 63.493347%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(63.713074%, 88.911438%, 63.713074%)" stop-opacity="1"/>
-<stop offset="0.226563" stop-color="rgb(63.934326%, 88.980103%, 63.934326%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(64.154053%, 89.047241%, 64.154053%)" stop-opacity="1"/>
-<stop offset="0.242188" stop-color="rgb(64.375305%, 89.11438%, 64.375305%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(64.595032%, 89.181519%, 64.595032%)" stop-opacity="1"/>
-<stop offset="0.257813" stop-color="rgb(64.816284%, 89.248657%, 64.816284%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(65.036011%, 89.315796%, 65.036011%)" stop-opacity="1"/>
-<stop offset="0.273438" stop-color="rgb(65.257263%, 89.38446%, 65.257263%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(65.478516%, 89.451599%, 65.478516%)" stop-opacity="1"/>
-<stop offset="0.289062" stop-color="rgb(65.698242%, 89.518738%, 65.698242%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(65.919495%, 89.585876%, 65.919495%)" stop-opacity="1"/>
-<stop offset="0.304688" stop-color="rgb(66.139221%, 89.653015%, 66.139221%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(66.360474%, 89.720154%, 66.360474%)" stop-opacity="1"/>
-<stop offset="0.320313" stop-color="rgb(66.5802%, 89.788818%, 66.5802%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(66.801453%, 89.855957%, 66.801453%)" stop-opacity="1"/>
-<stop offset="0.335938" stop-color="rgb(67.021179%, 89.923096%, 67.021179%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(67.242432%, 89.990234%, 67.242432%)" stop-opacity="1"/>
-<stop offset="0.351563" stop-color="rgb(67.462158%, 90.057373%, 67.462158%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(67.683411%, 90.124512%, 67.683411%)" stop-opacity="1"/>
-<stop offset="0.367188" stop-color="rgb(67.903137%, 90.193176%, 67.903137%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(68.12439%, 90.260315%, 68.12439%)" stop-opacity="1"/>
-<stop offset="0.382813" stop-color="rgb(68.345642%, 90.327454%, 68.345642%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(68.565369%, 90.394592%, 68.565369%)" stop-opacity="1"/>
-<stop offset="0.398438" stop-color="rgb(68.786621%, 90.461731%, 68.786621%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(69.006348%, 90.52887%, 69.006348%)" stop-opacity="1"/>
-<stop offset="0.414063" stop-color="rgb(69.2276%, 90.597534%, 69.2276%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(69.447327%, 90.664673%, 69.447327%)" stop-opacity="1"/>
-<stop offset="0.429688" stop-color="rgb(69.668579%, 90.731812%, 69.668579%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(69.888306%, 90.79895%, 69.888306%)" stop-opacity="1"/>
-<stop offset="0.445313" stop-color="rgb(70.109558%, 90.866089%, 70.109558%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(70.329285%, 90.933228%, 70.329285%)" stop-opacity="1"/>
-<stop offset="0.460938" stop-color="rgb(70.550537%, 91.001892%, 70.550537%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(70.770264%, 91.069031%, 70.770264%)" stop-opacity="1"/>
-<stop offset="0.476563" stop-color="rgb(70.991516%, 91.136169%, 70.991516%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(71.212769%, 91.203308%, 71.212769%)" stop-opacity="1"/>
-<stop offset="0.492188" stop-color="rgb(71.432495%, 91.270447%, 71.432495%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(71.653748%, 91.337585%, 71.653748%)" stop-opacity="1"/>
-<stop offset="0.507812" stop-color="rgb(71.873474%, 91.40625%, 71.873474%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(72.094727%, 91.473389%, 72.094727%)" stop-opacity="1"/>
-<stop offset="0.523438" stop-color="rgb(72.314453%, 91.540527%, 72.314453%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(72.535706%, 91.607666%, 72.535706%)" stop-opacity="1"/>
-<stop offset="0.539062" stop-color="rgb(72.755432%, 91.674805%, 72.755432%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(72.976685%, 91.741943%, 72.976685%)" stop-opacity="1"/>
-<stop offset="0.554687" stop-color="rgb(73.196411%, 91.810608%, 73.196411%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(73.417664%, 91.877747%, 73.417664%)" stop-opacity="1"/>
-<stop offset="0.570313" stop-color="rgb(73.638916%, 91.944885%, 73.638916%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(73.858643%, 92.012024%, 73.858643%)" stop-opacity="1"/>
-<stop offset="0.585938" stop-color="rgb(74.079895%, 92.079163%, 74.079895%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(74.299622%, 92.146301%, 74.299622%)" stop-opacity="1"/>
-<stop offset="0.601562" stop-color="rgb(74.520874%, 92.214966%, 74.520874%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(74.740601%, 92.282104%, 74.740601%)" stop-opacity="1"/>
-<stop offset="0.617188" stop-color="rgb(74.961853%, 92.349243%, 74.961853%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(75.18158%, 92.416382%, 75.18158%)" stop-opacity="1"/>
-<stop offset="0.632812" stop-color="rgb(75.402832%, 92.483521%, 75.402832%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(75.624084%, 92.550659%, 75.624084%)" stop-opacity="1"/>
-<stop offset="0.648437" stop-color="rgb(75.843811%, 92.619324%, 75.843811%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(76.065063%, 92.686462%, 76.065063%)" stop-opacity="1"/>
-<stop offset="0.664063" stop-color="rgb(76.28479%, 92.753601%, 76.28479%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(76.506042%, 92.82074%, 76.506042%)" stop-opacity="1"/>
-<stop offset="0.679688" stop-color="rgb(76.725769%, 92.887878%, 76.725769%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(76.947021%, 92.955017%, 76.947021%)" stop-opacity="1"/>
-<stop offset="0.695312" stop-color="rgb(77.166748%, 93.022156%, 77.166748%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(77.388%, 93.09082%, 77.388%)" stop-opacity="1"/>
-<stop offset="0.710938" stop-color="rgb(77.607727%, 93.157959%, 77.607727%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(77.828979%, 93.225098%, 77.828979%)" stop-opacity="1"/>
-<stop offset="0.726562" stop-color="rgb(78.050232%, 93.292236%, 78.050232%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(78.269958%, 93.359375%, 78.269958%)" stop-opacity="1"/>
-<stop offset="0.742188" stop-color="rgb(78.491211%, 93.426514%, 78.491211%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(78.710938%, 93.495178%, 78.710938%)" stop-opacity="1"/>
-<stop offset="0.757813" stop-color="rgb(78.93219%, 93.562317%, 78.93219%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(79.151917%, 93.629456%, 79.151917%)" stop-opacity="1"/>
-<stop offset="0.773438" stop-color="rgb(79.373169%, 93.696594%, 79.373169%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(79.592896%, 93.763733%, 79.592896%)" stop-opacity="1"/>
-<stop offset="0.789062" stop-color="rgb(79.814148%, 93.830872%, 79.814148%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(80.0354%, 93.899536%, 80.0354%)" stop-opacity="1"/>
-<stop offset="0.804688" stop-color="rgb(80.255127%, 93.966675%, 80.255127%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(80.476379%, 94.033813%, 80.476379%)" stop-opacity="1"/>
-<stop offset="0.820312" stop-color="rgb(80.696106%, 94.100952%, 80.696106%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(80.917358%, 94.168091%, 80.917358%)" stop-opacity="1"/>
-<stop offset="0.835938" stop-color="rgb(81.137085%, 94.235229%, 81.137085%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(81.358337%, 94.303894%, 81.358337%)" stop-opacity="1"/>
-<stop offset="0.851563" stop-color="rgb(81.578064%, 94.371033%, 81.578064%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(81.799316%, 94.438171%, 81.799316%)" stop-opacity="1"/>
-<stop offset="0.867188" stop-color="rgb(82.020569%, 94.50531%, 82.020569%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(82.240295%, 94.572449%, 82.240295%)" stop-opacity="1"/>
-<stop offset="0.882812" stop-color="rgb(82.461548%, 94.639587%, 82.461548%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(82.681274%, 94.706726%, 82.681274%)" stop-opacity="1"/>
-<stop offset="0.898438" stop-color="rgb(82.902527%, 94.775391%, 82.902527%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(83.122253%, 94.842529%, 83.122253%)" stop-opacity="1"/>
-<stop offset="0.914063" stop-color="rgb(83.343506%, 94.909668%, 83.343506%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(83.563232%, 94.976807%, 83.563232%)" stop-opacity="1"/>
-<stop offset="0.929688" stop-color="rgb(83.784485%, 95.043945%, 83.784485%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(84.004211%, 95.111084%, 84.004211%)" stop-opacity="1"/>
-<stop offset="0.945313" stop-color="rgb(84.225464%, 95.179749%, 84.225464%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(84.446716%, 95.246887%, 84.446716%)" stop-opacity="1"/>
-<stop offset="0.960938" stop-color="rgb(84.666443%, 95.314026%, 84.666443%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(84.887695%, 95.381165%, 84.887695%)" stop-opacity="1"/>
-<stop offset="0.976562" stop-color="rgb(85.107422%, 95.448303%, 85.107422%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(85.328674%, 95.515442%, 85.328674%)" stop-opacity="1"/>
-<stop offset="0.992188" stop-color="rgb(85.548401%, 95.584106%, 85.548401%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(85.769653%, 95.651245%, 85.769653%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-14">
-<path clip-rule="nonzero" d="M 280 45 L 349.308594 45 L 349.308594 69 L 280 69 Z M 280 45 "/>
-</clipPath>
-<clipPath id="clip-15">
-<path clip-rule="nonzero" d="M 281.082031 117 L 349.113281 117 L 349.113281 139 L 281.082031 139 Z M 281.082031 117 "/>
-</clipPath>
-<linearGradient id="linear-pattern-13" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.36081, 0, 0, -0.4536, 247.0581, 150.64429)">
-<stop offset="0" stop-color="rgb(57.649231%, 87.059021%, 57.649231%)" stop-opacity="1"/>
-<stop offset="0.0078125" stop-color="rgb(57.759094%, 87.09259%, 57.759094%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(57.978821%, 87.159729%, 57.978821%)" stop-opacity="1"/>
-<stop offset="0.0234375" stop-color="rgb(58.200073%, 87.226868%, 58.200073%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(58.4198%, 87.294006%, 58.4198%)" stop-opacity="1"/>
-<stop offset="0.0390625" stop-color="rgb(58.641052%, 87.361145%, 58.641052%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(58.860779%, 87.42981%, 58.860779%)" stop-opacity="1"/>
-<stop offset="0.0546875" stop-color="rgb(59.082031%, 87.496948%, 59.082031%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(59.301758%, 87.564087%, 59.301758%)" stop-opacity="1"/>
-<stop offset="0.0703125" stop-color="rgb(59.52301%, 87.631226%, 59.52301%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(59.742737%, 87.698364%, 59.742737%)" stop-opacity="1"/>
-<stop offset="0.0859375" stop-color="rgb(59.963989%, 87.765503%, 59.963989%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(60.185242%, 87.834167%, 60.185242%)" stop-opacity="1"/>
-<stop offset="0.101562" stop-color="rgb(60.404968%, 87.901306%, 60.404968%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(60.626221%, 87.968445%, 60.626221%)" stop-opacity="1"/>
-<stop offset="0.117188" stop-color="rgb(60.845947%, 88.035583%, 60.845947%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(61.0672%, 88.102722%, 61.0672%)" stop-opacity="1"/>
-<stop offset="0.132812" stop-color="rgb(61.286926%, 88.169861%, 61.286926%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(61.508179%, 88.238525%, 61.508179%)" stop-opacity="1"/>
-<stop offset="0.148438" stop-color="rgb(61.727905%, 88.305664%, 61.727905%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(61.949158%, 88.372803%, 61.949158%)" stop-opacity="1"/>
-<stop offset="0.164062" stop-color="rgb(62.168884%, 88.439941%, 62.168884%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(62.390137%, 88.50708%, 62.390137%)" stop-opacity="1"/>
-<stop offset="0.179688" stop-color="rgb(62.609863%, 88.574219%, 62.609863%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(62.831116%, 88.642883%, 62.831116%)" stop-opacity="1"/>
-<stop offset="0.195312" stop-color="rgb(63.052368%, 88.710022%, 63.052368%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(63.272095%, 88.777161%, 63.272095%)" stop-opacity="1"/>
-<stop offset="0.210937" stop-color="rgb(63.493347%, 88.844299%, 63.493347%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(63.713074%, 88.911438%, 63.713074%)" stop-opacity="1"/>
-<stop offset="0.226562" stop-color="rgb(63.934326%, 88.980103%, 63.934326%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(64.154053%, 89.047241%, 64.154053%)" stop-opacity="1"/>
-<stop offset="0.242187" stop-color="rgb(64.375305%, 89.11438%, 64.375305%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(64.595032%, 89.181519%, 64.595032%)" stop-opacity="1"/>
-<stop offset="0.257812" stop-color="rgb(64.816284%, 89.248657%, 64.816284%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(65.036011%, 89.315796%, 65.036011%)" stop-opacity="1"/>
-<stop offset="0.273437" stop-color="rgb(65.257263%, 89.38446%, 65.257263%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(65.478516%, 89.451599%, 65.478516%)" stop-opacity="1"/>
-<stop offset="0.289062" stop-color="rgb(65.698242%, 89.518738%, 65.698242%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(65.919495%, 89.585876%, 65.919495%)" stop-opacity="1"/>
-<stop offset="0.304687" stop-color="rgb(66.139221%, 89.653015%, 66.139221%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(66.360474%, 89.720154%, 66.360474%)" stop-opacity="1"/>
-<stop offset="0.320312" stop-color="rgb(66.5802%, 89.788818%, 66.5802%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(66.801453%, 89.855957%, 66.801453%)" stop-opacity="1"/>
-<stop offset="0.335937" stop-color="rgb(67.021179%, 89.923096%, 67.021179%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(67.242432%, 89.990234%, 67.242432%)" stop-opacity="1"/>
-<stop offset="0.351562" stop-color="rgb(67.462158%, 90.057373%, 67.462158%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(67.683411%, 90.124512%, 67.683411%)" stop-opacity="1"/>
-<stop offset="0.367187" stop-color="rgb(67.903137%, 90.193176%, 67.903137%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(68.12439%, 90.260315%, 68.12439%)" stop-opacity="1"/>
-<stop offset="0.382812" stop-color="rgb(68.345642%, 90.327454%, 68.345642%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(68.565369%, 90.394592%, 68.565369%)" stop-opacity="1"/>
-<stop offset="0.398437" stop-color="rgb(68.786621%, 90.461731%, 68.786621%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(69.006348%, 90.52887%, 69.006348%)" stop-opacity="1"/>
-<stop offset="0.414062" stop-color="rgb(69.2276%, 90.597534%, 69.2276%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(69.447327%, 90.664673%, 69.447327%)" stop-opacity="1"/>
-<stop offset="0.429687" stop-color="rgb(69.668579%, 90.731812%, 69.668579%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(69.888306%, 90.79895%, 69.888306%)" stop-opacity="1"/>
-<stop offset="0.445312" stop-color="rgb(70.109558%, 90.866089%, 70.109558%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(70.329285%, 90.933228%, 70.329285%)" stop-opacity="1"/>
-<stop offset="0.460937" stop-color="rgb(70.550537%, 91.001892%, 70.550537%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(70.770264%, 91.069031%, 70.770264%)" stop-opacity="1"/>
-<stop offset="0.476562" stop-color="rgb(70.991516%, 91.136169%, 70.991516%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(71.212769%, 91.203308%, 71.212769%)" stop-opacity="1"/>
-<stop offset="0.492188" stop-color="rgb(71.432495%, 91.270447%, 71.432495%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(71.653748%, 91.337585%, 71.653748%)" stop-opacity="1"/>
-<stop offset="0.507812" stop-color="rgb(71.873474%, 91.40625%, 71.873474%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(72.094727%, 91.473389%, 72.094727%)" stop-opacity="1"/>
-<stop offset="0.523437" stop-color="rgb(72.314453%, 91.540527%, 72.314453%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(72.535706%, 91.607666%, 72.535706%)" stop-opacity="1"/>
-<stop offset="0.539063" stop-color="rgb(72.755432%, 91.674805%, 72.755432%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(72.976685%, 91.741943%, 72.976685%)" stop-opacity="1"/>
-<stop offset="0.554688" stop-color="rgb(73.196411%, 91.810608%, 73.196411%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(73.417664%, 91.877747%, 73.417664%)" stop-opacity="1"/>
-<stop offset="0.570312" stop-color="rgb(73.638916%, 91.944885%, 73.638916%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(73.858643%, 92.012024%, 73.858643%)" stop-opacity="1"/>
-<stop offset="0.585938" stop-color="rgb(74.079895%, 92.079163%, 74.079895%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(74.299622%, 92.146301%, 74.299622%)" stop-opacity="1"/>
-<stop offset="0.601563" stop-color="rgb(74.520874%, 92.214966%, 74.520874%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(74.740601%, 92.282104%, 74.740601%)" stop-opacity="1"/>
-<stop offset="0.617188" stop-color="rgb(74.961853%, 92.349243%, 74.961853%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(75.18158%, 92.416382%, 75.18158%)" stop-opacity="1"/>
-<stop offset="0.632813" stop-color="rgb(75.402832%, 92.483521%, 75.402832%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(75.624084%, 92.550659%, 75.624084%)" stop-opacity="1"/>
-<stop offset="0.648438" stop-color="rgb(75.843811%, 92.619324%, 75.843811%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(76.065063%, 92.686462%, 76.065063%)" stop-opacity="1"/>
-<stop offset="0.664062" stop-color="rgb(76.28479%, 92.753601%, 76.28479%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(76.506042%, 92.82074%, 76.506042%)" stop-opacity="1"/>
-<stop offset="0.679688" stop-color="rgb(76.725769%, 92.887878%, 76.725769%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(76.947021%, 92.955017%, 76.947021%)" stop-opacity="1"/>
-<stop offset="0.695312" stop-color="rgb(77.166748%, 93.022156%, 77.166748%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(77.388%, 93.09082%, 77.388%)" stop-opacity="1"/>
-<stop offset="0.710938" stop-color="rgb(77.607727%, 93.157959%, 77.607727%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(77.828979%, 93.225098%, 77.828979%)" stop-opacity="1"/>
-<stop offset="0.726562" stop-color="rgb(78.050232%, 93.292236%, 78.050232%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(78.269958%, 93.359375%, 78.269958%)" stop-opacity="1"/>
-<stop offset="0.742188" stop-color="rgb(78.491211%, 93.426514%, 78.491211%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(78.710938%, 93.495178%, 78.710938%)" stop-opacity="1"/>
-<stop offset="0.757813" stop-color="rgb(78.93219%, 93.562317%, 78.93219%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(79.151917%, 93.629456%, 79.151917%)" stop-opacity="1"/>
-<stop offset="0.773438" stop-color="rgb(79.373169%, 93.696594%, 79.373169%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(79.592896%, 93.763733%, 79.592896%)" stop-opacity="1"/>
-<stop offset="0.789062" stop-color="rgb(79.814148%, 93.830872%, 79.814148%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(80.0354%, 93.899536%, 80.0354%)" stop-opacity="1"/>
-<stop offset="0.804688" stop-color="rgb(80.255127%, 93.966675%, 80.255127%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(80.476379%, 94.033813%, 80.476379%)" stop-opacity="1"/>
-<stop offset="0.820312" stop-color="rgb(80.696106%, 94.100952%, 80.696106%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(80.917358%, 94.168091%, 80.917358%)" stop-opacity="1"/>
-<stop offset="0.835938" stop-color="rgb(81.137085%, 94.235229%, 81.137085%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(81.358337%, 94.303894%, 81.358337%)" stop-opacity="1"/>
-<stop offset="0.851562" stop-color="rgb(81.578064%, 94.371033%, 81.578064%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(81.799316%, 94.438171%, 81.799316%)" stop-opacity="1"/>
-<stop offset="0.867188" stop-color="rgb(82.020569%, 94.50531%, 82.020569%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(82.240295%, 94.572449%, 82.240295%)" stop-opacity="1"/>
-<stop offset="0.882813" stop-color="rgb(82.461548%, 94.639587%, 82.461548%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(82.681274%, 94.706726%, 82.681274%)" stop-opacity="1"/>
-<stop offset="0.898438" stop-color="rgb(82.902527%, 94.775391%, 82.902527%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(83.122253%, 94.842529%, 83.122253%)" stop-opacity="1"/>
-<stop offset="0.914062" stop-color="rgb(83.343506%, 94.909668%, 83.343506%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(83.563232%, 94.976807%, 83.563232%)" stop-opacity="1"/>
-<stop offset="0.929688" stop-color="rgb(83.784485%, 95.043945%, 83.784485%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(84.004211%, 95.111084%, 84.004211%)" stop-opacity="1"/>
-<stop offset="0.945312" stop-color="rgb(84.225464%, 95.179749%, 84.225464%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(84.446716%, 95.246887%, 84.446716%)" stop-opacity="1"/>
-<stop offset="0.960938" stop-color="rgb(84.666443%, 95.314026%, 84.666443%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(84.887695%, 95.381165%, 84.887695%)" stop-opacity="1"/>
-<stop offset="0.976562" stop-color="rgb(85.107422%, 95.448303%, 85.107422%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(85.328674%, 95.515442%, 85.328674%)" stop-opacity="1"/>
-<stop offset="0.992187" stop-color="rgb(85.548401%, 95.584106%, 85.548401%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(85.769653%, 95.651245%, 85.769653%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-16">
-<path clip-rule="nonzero" d="M 280 116 L 349.308594 116 L 349.308594 140 L 280 140 Z M 280 116 "/>
-</clipPath>
-<clipPath id="clip-17">
-<path clip-rule="nonzero" d="M 281.082031 159.144531 L 349.113281 159.144531 L 349.113281 181.824219 L 281.082031 181.824219 Z M 281.082031 159.144531 "/>
-</clipPath>
-<linearGradient id="linear-pattern-14" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997244" gradientTransform="matrix(1.36081, 0, 0, -0.4536, 247.0581, 193.16448)">
-<stop offset="0" stop-color="rgb(57.649231%, 87.059021%, 57.649231%)" stop-opacity="1"/>
-<stop offset="0.0078125" stop-color="rgb(57.759094%, 87.09259%, 57.759094%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(57.978821%, 87.159729%, 57.978821%)" stop-opacity="1"/>
-<stop offset="0.0234375" stop-color="rgb(58.200073%, 87.226868%, 58.200073%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(58.4198%, 87.294006%, 58.4198%)" stop-opacity="1"/>
-<stop offset="0.0390625" stop-color="rgb(58.641052%, 87.361145%, 58.641052%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(58.860779%, 87.42981%, 58.860779%)" stop-opacity="1"/>
-<stop offset="0.0546875" stop-color="rgb(59.082031%, 87.496948%, 59.082031%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(59.301758%, 87.564087%, 59.301758%)" stop-opacity="1"/>
-<stop offset="0.0703125" stop-color="rgb(59.52301%, 87.631226%, 59.52301%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(59.742737%, 87.698364%, 59.742737%)" stop-opacity="1"/>
-<stop offset="0.0859375" stop-color="rgb(59.963989%, 87.765503%, 59.963989%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(60.185242%, 87.834167%, 60.185242%)" stop-opacity="1"/>
-<stop offset="0.101563" stop-color="rgb(60.404968%, 87.901306%, 60.404968%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(60.626221%, 87.968445%, 60.626221%)" stop-opacity="1"/>
-<stop offset="0.117187" stop-color="rgb(60.845947%, 88.035583%, 60.845947%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(61.0672%, 88.102722%, 61.0672%)" stop-opacity="1"/>
-<stop offset="0.132813" stop-color="rgb(61.286926%, 88.169861%, 61.286926%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(61.508179%, 88.238525%, 61.508179%)" stop-opacity="1"/>
-<stop offset="0.148438" stop-color="rgb(61.727905%, 88.305664%, 61.727905%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(61.949158%, 88.372803%, 61.949158%)" stop-opacity="1"/>
-<stop offset="0.164063" stop-color="rgb(62.168884%, 88.439941%, 62.168884%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(62.390137%, 88.50708%, 62.390137%)" stop-opacity="1"/>
-<stop offset="0.179688" stop-color="rgb(62.609863%, 88.574219%, 62.609863%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(62.831116%, 88.642883%, 62.831116%)" stop-opacity="1"/>
-<stop offset="0.195312" stop-color="rgb(63.052368%, 88.710022%, 63.052368%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(63.272095%, 88.777161%, 63.272095%)" stop-opacity="1"/>
-<stop offset="0.210938" stop-color="rgb(63.493347%, 88.844299%, 63.493347%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(63.713074%, 88.911438%, 63.713074%)" stop-opacity="1"/>
-<stop offset="0.226563" stop-color="rgb(63.934326%, 88.980103%, 63.934326%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(64.154053%, 89.047241%, 64.154053%)" stop-opacity="1"/>
-<stop offset="0.242187" stop-color="rgb(64.375305%, 89.11438%, 64.375305%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(64.595032%, 89.181519%, 64.595032%)" stop-opacity="1"/>
-<stop offset="0.257813" stop-color="rgb(64.816284%, 89.248657%, 64.816284%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(65.036011%, 89.315796%, 65.036011%)" stop-opacity="1"/>
-<stop offset="0.273438" stop-color="rgb(65.257263%, 89.38446%, 65.257263%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(65.47699%, 89.451599%, 65.47699%)" stop-opacity="1"/>
-<stop offset="0.289063" stop-color="rgb(65.698242%, 89.518738%, 65.698242%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(65.919495%, 89.585876%, 65.919495%)" stop-opacity="1"/>
-<stop offset="0.304688" stop-color="rgb(66.139221%, 89.653015%, 66.139221%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(66.360474%, 89.720154%, 66.360474%)" stop-opacity="1"/>
-<stop offset="0.320313" stop-color="rgb(66.5802%, 89.788818%, 66.5802%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(66.801453%, 89.855957%, 66.801453%)" stop-opacity="1"/>
-<stop offset="0.335938" stop-color="rgb(67.021179%, 89.923096%, 67.021179%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(67.242432%, 89.990234%, 67.242432%)" stop-opacity="1"/>
-<stop offset="0.351563" stop-color="rgb(67.462158%, 90.057373%, 67.462158%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(67.683411%, 90.124512%, 67.683411%)" stop-opacity="1"/>
-<stop offset="0.367188" stop-color="rgb(67.903137%, 90.193176%, 67.903137%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(68.12439%, 90.260315%, 68.12439%)" stop-opacity="1"/>
-<stop offset="0.382813" stop-color="rgb(68.345642%, 90.327454%, 68.345642%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(68.565369%, 90.394592%, 68.565369%)" stop-opacity="1"/>
-<stop offset="0.398438" stop-color="rgb(68.786621%, 90.461731%, 68.786621%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(69.006348%, 90.52887%, 69.006348%)" stop-opacity="1"/>
-<stop offset="0.414062" stop-color="rgb(69.2276%, 90.597534%, 69.2276%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(69.447327%, 90.664673%, 69.447327%)" stop-opacity="1"/>
-<stop offset="0.429688" stop-color="rgb(69.668579%, 90.731812%, 69.668579%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(69.888306%, 90.79895%, 69.888306%)" stop-opacity="1"/>
-<stop offset="0.445313" stop-color="rgb(70.109558%, 90.866089%, 70.109558%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(70.329285%, 90.933228%, 70.329285%)" stop-opacity="1"/>
-<stop offset="0.460938" stop-color="rgb(70.550537%, 91.001892%, 70.550537%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(70.770264%, 91.069031%, 70.770264%)" stop-opacity="1"/>
-<stop offset="0.476562" stop-color="rgb(70.991516%, 91.136169%, 70.991516%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(71.212769%, 91.203308%, 71.212769%)" stop-opacity="1"/>
-<stop offset="0.492188" stop-color="rgb(71.432495%, 91.270447%, 71.432495%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(71.653748%, 91.337585%, 71.653748%)" stop-opacity="1"/>
-<stop offset="0.507812" stop-color="rgb(71.873474%, 91.40625%, 71.873474%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(72.094727%, 91.473389%, 72.094727%)" stop-opacity="1"/>
-<stop offset="0.523437" stop-color="rgb(72.314453%, 91.540527%, 72.314453%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(72.535706%, 91.607666%, 72.535706%)" stop-opacity="1"/>
-<stop offset="0.539062" stop-color="rgb(72.755432%, 91.674805%, 72.755432%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(72.976685%, 91.741943%, 72.976685%)" stop-opacity="1"/>
-<stop offset="0.554688" stop-color="rgb(73.196411%, 91.810608%, 73.196411%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(73.417664%, 91.877747%, 73.417664%)" stop-opacity="1"/>
-<stop offset="0.570312" stop-color="rgb(73.638916%, 91.944885%, 73.638916%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(73.858643%, 92.012024%, 73.858643%)" stop-opacity="1"/>
-<stop offset="0.585938" stop-color="rgb(74.079895%, 92.079163%, 74.079895%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(74.299622%, 92.146301%, 74.299622%)" stop-opacity="1"/>
-<stop offset="0.601563" stop-color="rgb(74.520874%, 92.214966%, 74.520874%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(74.740601%, 92.282104%, 74.740601%)" stop-opacity="1"/>
-<stop offset="0.617187" stop-color="rgb(74.961853%, 92.349243%, 74.961853%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(75.18158%, 92.416382%, 75.18158%)" stop-opacity="1"/>
-<stop offset="0.632812" stop-color="rgb(75.402832%, 92.483521%, 75.402832%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(75.624084%, 92.550659%, 75.624084%)" stop-opacity="1"/>
-<stop offset="0.648438" stop-color="rgb(75.843811%, 92.619324%, 75.843811%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(76.065063%, 92.686462%, 76.065063%)" stop-opacity="1"/>
-<stop offset="0.664062" stop-color="rgb(76.28479%, 92.753601%, 76.28479%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(76.506042%, 92.82074%, 76.506042%)" stop-opacity="1"/>
-<stop offset="0.679688" stop-color="rgb(76.725769%, 92.887878%, 76.725769%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(76.947021%, 92.955017%, 76.947021%)" stop-opacity="1"/>
-<stop offset="0.695313" stop-color="rgb(77.166748%, 93.022156%, 77.166748%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(77.388%, 93.09082%, 77.388%)" stop-opacity="1"/>
-<stop offset="0.710937" stop-color="rgb(77.607727%, 93.157959%, 77.607727%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(77.828979%, 93.225098%, 77.828979%)" stop-opacity="1"/>
-<stop offset="0.726562" stop-color="rgb(78.050232%, 93.292236%, 78.050232%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(78.269958%, 93.359375%, 78.269958%)" stop-opacity="1"/>
-<stop offset="0.742188" stop-color="rgb(78.491211%, 93.426514%, 78.491211%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(78.710938%, 93.495178%, 78.710938%)" stop-opacity="1"/>
-<stop offset="0.757812" stop-color="rgb(78.93219%, 93.562317%, 78.93219%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(79.151917%, 93.629456%, 79.151917%)" stop-opacity="1"/>
-<stop offset="0.773437" stop-color="rgb(79.373169%, 93.696594%, 79.373169%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(79.592896%, 93.763733%, 79.592896%)" stop-opacity="1"/>
-<stop offset="0.789063" stop-color="rgb(79.814148%, 93.830872%, 79.814148%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(80.0354%, 93.899536%, 80.0354%)" stop-opacity="1"/>
-<stop offset="0.804687" stop-color="rgb(80.255127%, 93.966675%, 80.255127%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(80.476379%, 94.033813%, 80.476379%)" stop-opacity="1"/>
-<stop offset="0.820312" stop-color="rgb(80.696106%, 94.100952%, 80.696106%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(80.917358%, 94.168091%, 80.917358%)" stop-opacity="1"/>
-<stop offset="0.835938" stop-color="rgb(81.137085%, 94.235229%, 81.137085%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(81.358337%, 94.303894%, 81.358337%)" stop-opacity="1"/>
-<stop offset="0.851562" stop-color="rgb(81.578064%, 94.371033%, 81.578064%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(81.799316%, 94.438171%, 81.799316%)" stop-opacity="1"/>
-<stop offset="0.867187" stop-color="rgb(82.019043%, 94.50531%, 82.019043%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(82.240295%, 94.572449%, 82.240295%)" stop-opacity="1"/>
-<stop offset="0.882812" stop-color="rgb(82.461548%, 94.639587%, 82.461548%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(82.681274%, 94.706726%, 82.681274%)" stop-opacity="1"/>
-<stop offset="0.898438" stop-color="rgb(82.902527%, 94.775391%, 82.902527%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(83.122253%, 94.842529%, 83.122253%)" stop-opacity="1"/>
-<stop offset="0.914062" stop-color="rgb(83.343506%, 94.909668%, 83.343506%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(83.563232%, 94.976807%, 83.563232%)" stop-opacity="1"/>
-<stop offset="0.929688" stop-color="rgb(83.784485%, 95.043945%, 83.784485%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(84.004211%, 95.111084%, 84.004211%)" stop-opacity="1"/>
-<stop offset="0.945313" stop-color="rgb(84.225464%, 95.179749%, 84.225464%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(84.446716%, 95.246887%, 84.446716%)" stop-opacity="1"/>
-<stop offset="0.960937" stop-color="rgb(84.666443%, 95.314026%, 84.666443%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(84.887695%, 95.381165%, 84.887695%)" stop-opacity="1"/>
-<stop offset="0.976562" stop-color="rgb(85.107422%, 95.448303%, 85.107422%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(85.328674%, 95.515442%, 85.328674%)" stop-opacity="1"/>
-<stop offset="0.992188" stop-color="rgb(85.548401%, 95.584106%, 85.548401%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(85.769653%, 95.651245%, 85.769653%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-18">
-<path clip-rule="nonzero" d="M 280 158 L 349.308594 158 L 349.308594 183 L 280 183 Z M 280 158 "/>
-</clipPath>
-<clipPath id="clip-19">
-<path clip-rule="nonzero" d="M 281.082031 202 L 349.113281 202 L 349.113281 224 L 281.082031 224 Z M 281.082031 202 "/>
-</clipPath>
-<linearGradient id="linear-pattern-15" gradientUnits="userSpaceOnUse" x1="0" y1="25.002734" x2="0" y2="74.997266" gradientTransform="matrix(1.36081, 0, 0, -0.4536, 247.0581, 235.68468)">
-<stop offset="0" stop-color="rgb(57.649231%, 87.059021%, 57.649231%)" stop-opacity="1"/>
-<stop offset="0.0078125" stop-color="rgb(57.759094%, 87.09259%, 57.759094%)" stop-opacity="1"/>
-<stop offset="0.015625" stop-color="rgb(57.978821%, 87.159729%, 57.978821%)" stop-opacity="1"/>
-<stop offset="0.0234375" stop-color="rgb(58.200073%, 87.226868%, 58.200073%)" stop-opacity="1"/>
-<stop offset="0.03125" stop-color="rgb(58.4198%, 87.294006%, 58.4198%)" stop-opacity="1"/>
-<stop offset="0.0390625" stop-color="rgb(58.641052%, 87.361145%, 58.641052%)" stop-opacity="1"/>
-<stop offset="0.046875" stop-color="rgb(58.860779%, 87.42981%, 58.860779%)" stop-opacity="1"/>
-<stop offset="0.0546875" stop-color="rgb(59.082031%, 87.496948%, 59.082031%)" stop-opacity="1"/>
-<stop offset="0.0625" stop-color="rgb(59.301758%, 87.564087%, 59.301758%)" stop-opacity="1"/>
-<stop offset="0.0703125" stop-color="rgb(59.52301%, 87.631226%, 59.52301%)" stop-opacity="1"/>
-<stop offset="0.078125" stop-color="rgb(59.742737%, 87.698364%, 59.742737%)" stop-opacity="1"/>
-<stop offset="0.0859375" stop-color="rgb(59.963989%, 87.765503%, 59.963989%)" stop-opacity="1"/>
-<stop offset="0.09375" stop-color="rgb(60.185242%, 87.834167%, 60.185242%)" stop-opacity="1"/>
-<stop offset="0.101562" stop-color="rgb(60.404968%, 87.901306%, 60.404968%)" stop-opacity="1"/>
-<stop offset="0.109375" stop-color="rgb(60.626221%, 87.968445%, 60.626221%)" stop-opacity="1"/>
-<stop offset="0.117188" stop-color="rgb(60.845947%, 88.035583%, 60.845947%)" stop-opacity="1"/>
-<stop offset="0.125" stop-color="rgb(61.0672%, 88.102722%, 61.0672%)" stop-opacity="1"/>
-<stop offset="0.132812" stop-color="rgb(61.286926%, 88.169861%, 61.286926%)" stop-opacity="1"/>
-<stop offset="0.140625" stop-color="rgb(61.508179%, 88.238525%, 61.508179%)" stop-opacity="1"/>
-<stop offset="0.148438" stop-color="rgb(61.727905%, 88.305664%, 61.727905%)" stop-opacity="1"/>
-<stop offset="0.15625" stop-color="rgb(61.949158%, 88.372803%, 61.949158%)" stop-opacity="1"/>
-<stop offset="0.164062" stop-color="rgb(62.168884%, 88.439941%, 62.168884%)" stop-opacity="1"/>
-<stop offset="0.171875" stop-color="rgb(62.390137%, 88.50708%, 62.390137%)" stop-opacity="1"/>
-<stop offset="0.179688" stop-color="rgb(62.609863%, 88.574219%, 62.609863%)" stop-opacity="1"/>
-<stop offset="0.1875" stop-color="rgb(62.831116%, 88.642883%, 62.831116%)" stop-opacity="1"/>
-<stop offset="0.195312" stop-color="rgb(63.052368%, 88.710022%, 63.052368%)" stop-opacity="1"/>
-<stop offset="0.203125" stop-color="rgb(63.272095%, 88.777161%, 63.272095%)" stop-opacity="1"/>
-<stop offset="0.210937" stop-color="rgb(63.493347%, 88.844299%, 63.493347%)" stop-opacity="1"/>
-<stop offset="0.21875" stop-color="rgb(63.713074%, 88.911438%, 63.713074%)" stop-opacity="1"/>
-<stop offset="0.226562" stop-color="rgb(63.934326%, 88.980103%, 63.934326%)" stop-opacity="1"/>
-<stop offset="0.234375" stop-color="rgb(64.154053%, 89.047241%, 64.154053%)" stop-opacity="1"/>
-<stop offset="0.242187" stop-color="rgb(64.375305%, 89.11438%, 64.375305%)" stop-opacity="1"/>
-<stop offset="0.25" stop-color="rgb(64.595032%, 89.181519%, 64.595032%)" stop-opacity="1"/>
-<stop offset="0.257812" stop-color="rgb(64.816284%, 89.248657%, 64.816284%)" stop-opacity="1"/>
-<stop offset="0.265625" stop-color="rgb(65.036011%, 89.315796%, 65.036011%)" stop-opacity="1"/>
-<stop offset="0.273437" stop-color="rgb(65.257263%, 89.38446%, 65.257263%)" stop-opacity="1"/>
-<stop offset="0.28125" stop-color="rgb(65.478516%, 89.451599%, 65.478516%)" stop-opacity="1"/>
-<stop offset="0.289062" stop-color="rgb(65.698242%, 89.518738%, 65.698242%)" stop-opacity="1"/>
-<stop offset="0.296875" stop-color="rgb(65.919495%, 89.585876%, 65.919495%)" stop-opacity="1"/>
-<stop offset="0.304687" stop-color="rgb(66.139221%, 89.653015%, 66.139221%)" stop-opacity="1"/>
-<stop offset="0.3125" stop-color="rgb(66.360474%, 89.720154%, 66.360474%)" stop-opacity="1"/>
-<stop offset="0.320312" stop-color="rgb(66.5802%, 89.788818%, 66.5802%)" stop-opacity="1"/>
-<stop offset="0.328125" stop-color="rgb(66.801453%, 89.855957%, 66.801453%)" stop-opacity="1"/>
-<stop offset="0.335937" stop-color="rgb(67.021179%, 89.923096%, 67.021179%)" stop-opacity="1"/>
-<stop offset="0.34375" stop-color="rgb(67.242432%, 89.990234%, 67.242432%)" stop-opacity="1"/>
-<stop offset="0.351562" stop-color="rgb(67.462158%, 90.057373%, 67.462158%)" stop-opacity="1"/>
-<stop offset="0.359375" stop-color="rgb(67.683411%, 90.124512%, 67.683411%)" stop-opacity="1"/>
-<stop offset="0.367187" stop-color="rgb(67.903137%, 90.193176%, 67.903137%)" stop-opacity="1"/>
-<stop offset="0.375" stop-color="rgb(68.12439%, 90.260315%, 68.12439%)" stop-opacity="1"/>
-<stop offset="0.382812" stop-color="rgb(68.345642%, 90.327454%, 68.345642%)" stop-opacity="1"/>
-<stop offset="0.390625" stop-color="rgb(68.565369%, 90.394592%, 68.565369%)" stop-opacity="1"/>
-<stop offset="0.398437" stop-color="rgb(68.786621%, 90.461731%, 68.786621%)" stop-opacity="1"/>
-<stop offset="0.40625" stop-color="rgb(69.006348%, 90.52887%, 69.006348%)" stop-opacity="1"/>
-<stop offset="0.414062" stop-color="rgb(69.2276%, 90.597534%, 69.2276%)" stop-opacity="1"/>
-<stop offset="0.421875" stop-color="rgb(69.447327%, 90.664673%, 69.447327%)" stop-opacity="1"/>
-<stop offset="0.429687" stop-color="rgb(69.668579%, 90.731812%, 69.668579%)" stop-opacity="1"/>
-<stop offset="0.4375" stop-color="rgb(69.888306%, 90.79895%, 69.888306%)" stop-opacity="1"/>
-<stop offset="0.445312" stop-color="rgb(70.109558%, 90.866089%, 70.109558%)" stop-opacity="1"/>
-<stop offset="0.453125" stop-color="rgb(70.329285%, 90.933228%, 70.329285%)" stop-opacity="1"/>
-<stop offset="0.460937" stop-color="rgb(70.550537%, 91.001892%, 70.550537%)" stop-opacity="1"/>
-<stop offset="0.46875" stop-color="rgb(70.770264%, 91.069031%, 70.770264%)" stop-opacity="1"/>
-<stop offset="0.476562" stop-color="rgb(70.991516%, 91.136169%, 70.991516%)" stop-opacity="1"/>
-<stop offset="0.484375" stop-color="rgb(71.212769%, 91.203308%, 71.212769%)" stop-opacity="1"/>
-<stop offset="0.492188" stop-color="rgb(71.432495%, 91.270447%, 71.432495%)" stop-opacity="1"/>
-<stop offset="0.5" stop-color="rgb(71.653748%, 91.337585%, 71.653748%)" stop-opacity="1"/>
-<stop offset="0.507812" stop-color="rgb(71.873474%, 91.40625%, 71.873474%)" stop-opacity="1"/>
-<stop offset="0.515625" stop-color="rgb(72.094727%, 91.473389%, 72.094727%)" stop-opacity="1"/>
-<stop offset="0.523437" stop-color="rgb(72.314453%, 91.540527%, 72.314453%)" stop-opacity="1"/>
-<stop offset="0.53125" stop-color="rgb(72.535706%, 91.607666%, 72.535706%)" stop-opacity="1"/>
-<stop offset="0.539063" stop-color="rgb(72.755432%, 91.674805%, 72.755432%)" stop-opacity="1"/>
-<stop offset="0.546875" stop-color="rgb(72.976685%, 91.741943%, 72.976685%)" stop-opacity="1"/>
-<stop offset="0.554688" stop-color="rgb(73.196411%, 91.810608%, 73.196411%)" stop-opacity="1"/>
-<stop offset="0.5625" stop-color="rgb(73.417664%, 91.877747%, 73.417664%)" stop-opacity="1"/>
-<stop offset="0.570312" stop-color="rgb(73.638916%, 91.944885%, 73.638916%)" stop-opacity="1"/>
-<stop offset="0.578125" stop-color="rgb(73.858643%, 92.012024%, 73.858643%)" stop-opacity="1"/>
-<stop offset="0.585938" stop-color="rgb(74.079895%, 92.079163%, 74.079895%)" stop-opacity="1"/>
-<stop offset="0.59375" stop-color="rgb(74.299622%, 92.146301%, 74.299622%)" stop-opacity="1"/>
-<stop offset="0.601563" stop-color="rgb(74.520874%, 92.214966%, 74.520874%)" stop-opacity="1"/>
-<stop offset="0.609375" stop-color="rgb(74.740601%, 92.282104%, 74.740601%)" stop-opacity="1"/>
-<stop offset="0.617188" stop-color="rgb(74.961853%, 92.349243%, 74.961853%)" stop-opacity="1"/>
-<stop offset="0.625" stop-color="rgb(75.18158%, 92.416382%, 75.18158%)" stop-opacity="1"/>
-<stop offset="0.632813" stop-color="rgb(75.402832%, 92.483521%, 75.402832%)" stop-opacity="1"/>
-<stop offset="0.640625" stop-color="rgb(75.624084%, 92.550659%, 75.624084%)" stop-opacity="1"/>
-<stop offset="0.648438" stop-color="rgb(75.843811%, 92.619324%, 75.843811%)" stop-opacity="1"/>
-<stop offset="0.65625" stop-color="rgb(76.065063%, 92.686462%, 76.065063%)" stop-opacity="1"/>
-<stop offset="0.664062" stop-color="rgb(76.28479%, 92.753601%, 76.28479%)" stop-opacity="1"/>
-<stop offset="0.671875" stop-color="rgb(76.506042%, 92.82074%, 76.506042%)" stop-opacity="1"/>
-<stop offset="0.679688" stop-color="rgb(76.725769%, 92.887878%, 76.725769%)" stop-opacity="1"/>
-<stop offset="0.6875" stop-color="rgb(76.947021%, 92.955017%, 76.947021%)" stop-opacity="1"/>
-<stop offset="0.695312" stop-color="rgb(77.166748%, 93.022156%, 77.166748%)" stop-opacity="1"/>
-<stop offset="0.703125" stop-color="rgb(77.388%, 93.09082%, 77.388%)" stop-opacity="1"/>
-<stop offset="0.710938" stop-color="rgb(77.607727%, 93.157959%, 77.607727%)" stop-opacity="1"/>
-<stop offset="0.71875" stop-color="rgb(77.828979%, 93.225098%, 77.828979%)" stop-opacity="1"/>
-<stop offset="0.726562" stop-color="rgb(78.050232%, 93.292236%, 78.050232%)" stop-opacity="1"/>
-<stop offset="0.734375" stop-color="rgb(78.269958%, 93.359375%, 78.269958%)" stop-opacity="1"/>
-<stop offset="0.742188" stop-color="rgb(78.491211%, 93.426514%, 78.491211%)" stop-opacity="1"/>
-<stop offset="0.75" stop-color="rgb(78.710938%, 93.495178%, 78.710938%)" stop-opacity="1"/>
-<stop offset="0.757813" stop-color="rgb(78.93219%, 93.562317%, 78.93219%)" stop-opacity="1"/>
-<stop offset="0.765625" stop-color="rgb(79.151917%, 93.629456%, 79.151917%)" stop-opacity="1"/>
-<stop offset="0.773438" stop-color="rgb(79.373169%, 93.696594%, 79.373169%)" stop-opacity="1"/>
-<stop offset="0.78125" stop-color="rgb(79.592896%, 93.763733%, 79.592896%)" stop-opacity="1"/>
-<stop offset="0.789062" stop-color="rgb(79.814148%, 93.830872%, 79.814148%)" stop-opacity="1"/>
-<stop offset="0.796875" stop-color="rgb(80.0354%, 93.899536%, 80.0354%)" stop-opacity="1"/>
-<stop offset="0.804688" stop-color="rgb(80.255127%, 93.966675%, 80.255127%)" stop-opacity="1"/>
-<stop offset="0.8125" stop-color="rgb(80.476379%, 94.033813%, 80.476379%)" stop-opacity="1"/>
-<stop offset="0.820312" stop-color="rgb(80.696106%, 94.100952%, 80.696106%)" stop-opacity="1"/>
-<stop offset="0.828125" stop-color="rgb(80.917358%, 94.168091%, 80.917358%)" stop-opacity="1"/>
-<stop offset="0.835938" stop-color="rgb(81.137085%, 94.235229%, 81.137085%)" stop-opacity="1"/>
-<stop offset="0.84375" stop-color="rgb(81.358337%, 94.303894%, 81.358337%)" stop-opacity="1"/>
-<stop offset="0.851562" stop-color="rgb(81.578064%, 94.371033%, 81.578064%)" stop-opacity="1"/>
-<stop offset="0.859375" stop-color="rgb(81.799316%, 94.438171%, 81.799316%)" stop-opacity="1"/>
-<stop offset="0.867188" stop-color="rgb(82.020569%, 94.50531%, 82.020569%)" stop-opacity="1"/>
-<stop offset="0.875" stop-color="rgb(82.240295%, 94.572449%, 82.240295%)" stop-opacity="1"/>
-<stop offset="0.882813" stop-color="rgb(82.461548%, 94.639587%, 82.461548%)" stop-opacity="1"/>
-<stop offset="0.890625" stop-color="rgb(82.681274%, 94.706726%, 82.681274%)" stop-opacity="1"/>
-<stop offset="0.898438" stop-color="rgb(82.902527%, 94.775391%, 82.902527%)" stop-opacity="1"/>
-<stop offset="0.90625" stop-color="rgb(83.122253%, 94.842529%, 83.122253%)" stop-opacity="1"/>
-<stop offset="0.914062" stop-color="rgb(83.343506%, 94.909668%, 83.343506%)" stop-opacity="1"/>
-<stop offset="0.921875" stop-color="rgb(83.563232%, 94.976807%, 83.563232%)" stop-opacity="1"/>
-<stop offset="0.929688" stop-color="rgb(83.784485%, 95.043945%, 83.784485%)" stop-opacity="1"/>
-<stop offset="0.9375" stop-color="rgb(84.004211%, 95.111084%, 84.004211%)" stop-opacity="1"/>
-<stop offset="0.945312" stop-color="rgb(84.225464%, 95.179749%, 84.225464%)" stop-opacity="1"/>
-<stop offset="0.953125" stop-color="rgb(84.446716%, 95.246887%, 84.446716%)" stop-opacity="1"/>
-<stop offset="0.960938" stop-color="rgb(84.666443%, 95.314026%, 84.666443%)" stop-opacity="1"/>
-<stop offset="0.96875" stop-color="rgb(84.887695%, 95.381165%, 84.887695%)" stop-opacity="1"/>
-<stop offset="0.976562" stop-color="rgb(85.107422%, 95.448303%, 85.107422%)" stop-opacity="1"/>
-<stop offset="0.984375" stop-color="rgb(85.328674%, 95.515442%, 85.328674%)" stop-opacity="1"/>
-<stop offset="0.992187" stop-color="rgb(85.548401%, 95.584106%, 85.548401%)" stop-opacity="1"/>
-<stop offset="1" stop-color="rgb(85.769653%, 95.651245%, 85.769653%)" stop-opacity="1"/>
-</linearGradient>
-<clipPath id="clip-20">
-<path clip-rule="nonzero" d="M 280 201 L 349.308594 201 L 349.308594 225 L 280 225 Z M 280 201 "/>
-</clipPath>
-</defs>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-0" x="3.321" y="11.678"/>
-<use xlink:href="#glyph-0-1" x="10.840802" y="11.678"/>
-<use xlink:href="#glyph-0-1" x="17.093356" y="11.678"/>
-<use xlink:href="#glyph-0-2" x="23.34591" y="11.678"/>
-<use xlink:href="#glyph-0-3" x="26.358613" y="11.678"/>
-<use xlink:href="#glyph-0-4" x="29.466957" y="11.678"/>
-<use xlink:href="#glyph-0-5" x="34.595725" y="11.678"/>
-<use xlink:href="#glyph-0-6" x="40.346161" y="11.678"/>
-<use xlink:href="#glyph-0-3" x="44.410919" y="11.678"/>
-<use xlink:href="#glyph-0-7" x="47.519263" y="11.678"/>
-<use xlink:href="#glyph-0-8" x="53.496848" y="11.678"/>
-</g>
-<g clip-path="url(#clip-0)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-0)" d="M 8.953125 215.839844 L 8.953125 193.160156 L 54.308594 193.160156 L 54.308594 215.839844 Z M 8.953125 215.839844 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(47.059631%, 17.64679%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 0.000125 -51.024844 L 0.000125 -28.345156 L 45.355594 -28.345156 L 45.355594 -51.024844 Z M 0.000125 -51.024844 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-0" x="21.04" y="207.928"/>
-<use xlink:href="#glyph-1-1" x="27.017584" y="207.928"/>
-<use xlink:href="#glyph-1-2" x="31.809614" y="207.928"/>
-<use xlink:href="#glyph-1-3" x="35.196911" y="207.928"/>
-<use xlink:href="#glyph-1-4" x="39.709987" y="207.928"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 45.355594 -39.685 L 89.449344 20.943906 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 99.664062 142.136719 L 96.261719 143.902344 L 98.402344 143.871094 L 99.035156 145.917969 "/>
-<g clip-path="url(#clip-1)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-1)" d="M 8.953125 181.824219 L 8.953125 159.144531 L 54.308594 159.144531 L 54.308594 181.824219 Z M 8.953125 181.824219 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(47.059631%, 17.64679%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 0.000125 -17.009219 L 0.000125 5.670469 L 45.355594 5.670469 L 45.355594 -17.009219 Z M 0.000125 -17.009219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-5" x="18.818" y="173.713"/>
-<use xlink:href="#glyph-1-6" x="24.387116" y="173.713"/>
-<use xlink:href="#glyph-1-7" x="29.747016" y="173.713"/>
-<use xlink:href="#glyph-1-8" x="35.126842" y="173.713"/>
-<use xlink:href="#glyph-1-9" x="40.167938" y="173.713"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 45.355594 -5.669375 L 88.945437 24.299375 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 99.664062 139.304688 L 95.867188 139.832031 L 97.898438 140.515625 L 97.808594 142.65625 "/>
-<g clip-path="url(#clip-2)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-2)" d="M 8.953125 147.808594 L 8.953125 125.128906 L 54.308594 125.128906 L 54.308594 147.808594 Z M 8.953125 147.808594 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(47.059631%, 17.64679%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 0.000125 17.006406 L 0.000125 39.686094 L 45.355594 39.686094 L 45.355594 17.006406 Z M 0.000125 17.006406 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-10" x="14.026" y="139.896"/>
-<use xlink:href="#glyph-1-3" x="21.179176" y="139.896"/>
-<use xlink:href="#glyph-1-11" x="25.692251" y="139.896"/>
-<use xlink:href="#glyph-1-12" x="30.703459" y="139.896"/>
-<use xlink:href="#glyph-1-13" x="35.336087" y="139.896"/>
-<use xlink:href="#glyph-1-13" x="39.968715" y="139.896"/>
-<use xlink:href="#glyph-1-13" x="44.601342" y="139.896"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 45.355594 28.34625 L 88.566531 28.34625 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 99.664062 136.46875 L 96.234375 134.753906 L 97.519531 136.46875 L 96.234375 138.183594 "/>
-<g clip-path="url(#clip-3)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-3)" d="M 8.953125 113.792969 L 8.953125 91.113281 L 54.308594 91.113281 L 54.308594 113.792969 Z M 8.953125 113.792969 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(47.059631%, 17.64679%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 0.000125 51.022031 L 0.000125 73.701719 L 45.355594 73.701719 L 45.355594 51.022031 Z M 0.000125 51.022031 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-14" x="17.613" y="105.736"/>
-<use xlink:href="#glyph-1-15" x="26.439899" y="105.736"/>
-<use xlink:href="#glyph-1-6" x="31.46107" y="105.736"/>
-<use xlink:href="#glyph-1-14" x="36.82097" y="105.736"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 45.355594 62.361875 L 88.945437 32.393125 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 99.664062 133.632812 L 97.808594 130.28125 L 97.898438 132.421875 L 95.867188 133.105469 "/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-9" x="104.46" y="11.666"/>
-<use xlink:href="#glyph-0-3" x="110.616913" y="11.666"/>
-<use xlink:href="#glyph-0-10" x="113.725257" y="11.666"/>
-<use xlink:href="#glyph-0-11" x="119.750662" y="11.666"/>
-<use xlink:href="#glyph-0-5" x="123.994748" y="11.666"/>
-<use xlink:href="#glyph-0-11" x="129.745185" y="11.666"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-12" x="134.168597" y="11.666"/>
-</g>
-<g clip-path="url(#clip-4)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-4)" d="M 99.664062 147.808594 L 99.664062 125.128906 L 145.019531 125.128906 L 145.019531 147.808594 Z M 99.664062 147.808594 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(23.529053%, 35.293579%, 54.116821%)" stroke-opacity="1" stroke-miterlimit="10" d="M 90.711062 17.006406 L 90.711062 39.686094 L 136.066531 39.686094 L 136.066531 17.006406 Z M 90.711062 17.006406 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-4" x="105.009" y="139.896"/>
-<use xlink:href="#glyph-1-16" x="107.519585" y="139.896"/>
-<use xlink:href="#glyph-1-17" x="110.109872" y="139.896"/>
-<use xlink:href="#glyph-1-18" x="115.131042" y="139.896"/>
-<use xlink:href="#glyph-1-6" x="121.87575" y="139.896"/>
-<use xlink:href="#glyph-1-6" x="127.23565" y="139.896"/>
-<use xlink:href="#glyph-1-19" x="132.59555" y="139.896"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 180.550906 128.44 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 34.417969 L 187.414062 36.855469 L 189.503906 36.375 L 190.546875 38.246094 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 180.230594 94.59625 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 68.4375 L 187.046875 70.335938 L 189.183594 70.21875 L 189.898438 72.238281 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 179.707156 61.076719 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 102.453125 L 186.601562 103.136719 L 188.660156 103.738281 L 188.660156 105.878906 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 179.277469 28.34625 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 136.46875 L 186.945312 134.753906 L 188.230469 136.46875 L 186.945312 138.183594 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 179.707156 -4.384219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 170.484375 L 188.660156 167.058594 L 188.660156 169.199219 L 186.601562 169.796875 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 180.230594 -37.90375 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 204.5 L 189.898438 200.699219 L 189.183594 202.71875 L 187.046875 202.597656 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 136.066531 28.34625 L 180.550906 -71.743594 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 190.371094 238.515625 L 190.546875 234.6875 L 189.503906 236.558594 L 187.414062 236.082031 "/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-13" x="193.706" y="13.023"/>
-<use xlink:href="#glyph-0-5" x="201.070385" y="13.023"/>
-<use xlink:href="#glyph-0-4" x="206.820821" y="13.023"/>
-<use xlink:href="#glyph-0-14" x="211.949589" y="13.023"/>
-<use xlink:href="#glyph-0-15" x="217.96304" y="13.023"/>
-<use xlink:href="#glyph-0-8" x="223.378732" y="13.023"/>
-<use xlink:href="#glyph-0-16" x="229.822569" y="13.023"/>
-<use xlink:href="#glyph-0-17" x="236.158809" y="13.023"/>
-</g>
-<g clip-path="url(#clip-5)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-5)" d="M 190.371094 45.757812 L 190.371094 23.082031 L 247.066406 23.082031 L 247.066406 45.757812 Z M 190.371094 45.757812 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 119.057187 L 181.418094 141.732969 L 238.113406 141.732969 L 238.113406 119.057187 Z M 181.418094 119.057187 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-5" x="204.654" y="37.65"/>
-<use xlink:href="#glyph-1-20" x="210.223116" y="37.65"/>
-<use xlink:href="#glyph-1-21" x="215.573053" y="37.65"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-3" x="219.03009" y="37.65"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-18" x="226.033825" y="37.65"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 130.393125 L 270.246219 112.990781 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 52.84375 L 278.886719 49.707031 L 279.199219 51.824219 L 277.253906 52.71875 "/>
-<g clip-path="url(#clip-6)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-6)" d="M 190.371094 79.773438 L 190.371094 57.097656 L 247.066406 57.097656 L 247.066406 79.773438 Z M 190.371094 79.773438 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 85.041562 L 181.418094 107.717344 L 238.113406 107.717344 L 238.113406 85.041562 Z M 181.418094 85.041562 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-22" x="210.104" y="71.665"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-23" x="215.254685" y="71.665"/>
-<use xlink:href="#glyph-1-24" x="221.541111" y="71.665"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 96.3775 L 270.1095 107.709531 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 56.390625 L 277.28125 55.917969 L 279.0625 57.105469 L 278.421875 59.148438 "/>
-<g clip-path="url(#clip-7)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-7)" d="M 190.371094 113.792969 L 190.371094 91.113281 L 247.066406 91.113281 L 247.066406 113.792969 Z M 190.371094 113.792969 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 51.022031 L 181.418094 73.701719 L 238.113406 73.701719 L 238.113406 51.022031 Z M 181.418094 51.022031 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-25" x="197.377" y="105.701"/>
-<use xlink:href="#glyph-1-26" x="202.50776" y="105.701"/>
-<use xlink:href="#glyph-1-27" x="205.436776" y="105.701"/>
-<use xlink:href="#glyph-1-24" x="211.573762" y="105.701"/>
-<use xlink:href="#glyph-1-8" x="217.362056" y="105.701"/>
-<use xlink:href="#glyph-1-14" x="222.403152" y="105.701"/>
-<use xlink:href="#glyph-1-14" x="231.230051" y="105.701"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 62.361875 L 270.777469 102.514219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 60.640625 L 277.589844 62.21875 L 279.730469 62.300781 L 280.25 64.378906 "/>
-<g clip-path="url(#clip-8)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-8)" d="M 190.371094 147.808594 L 190.371094 125.128906 L 247.066406 125.128906 L 247.066406 147.808594 Z M 190.371094 147.808594 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 17.006406 L 181.418094 39.686094 L 238.113406 39.686094 L 238.113406 17.006406 Z M 181.418094 17.006406 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-18" x="205.372" y="139.697"/>
-<use xlink:href="#glyph-1-28" x="212.116707" y="139.697"/>
-<use xlink:href="#glyph-1-19" x="219.010854" y="139.697"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-22" x="225.795412" y="139.697"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 28.34625 L 270.097781 39.006406 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 125.128906 L 277.289062 124.585938 L 279.050781 125.808594 L 278.371094 127.839844 "/>
-<g clip-path="url(#clip-9)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-9)" d="M 190.371094 181.824219 L 190.371094 159.144531 L 247.066406 159.144531 L 247.066406 181.824219 Z M 190.371094 181.824219 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 -17.009219 L 181.418094 5.670469 L 238.113406 5.670469 L 238.113406 -17.009219 Z M 181.418094 -17.009219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-29" x="210.851" y="173.718"/>
-<use xlink:href="#glyph-1-26" x="218.083877" y="173.718"/>
-<use xlink:href="#glyph-1-5" x="221.012893" y="173.718"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 -5.669375 L 269.988406 -5.005313 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 169.777344 L 277.621094 168.132812 L 278.941406 169.820312 L 277.691406 171.5625 "/>
-<g clip-path="url(#clip-10)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-10)" d="M 190.371094 215.839844 L 190.371094 193.160156 L 247.066406 193.160156 L 247.066406 215.839844 Z M 190.371094 215.839844 "/>
-</g>
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 -51.024844 L 181.418094 -28.345156 L 238.113406 -28.345156 L 238.113406 -51.024844 Z M 181.418094 -51.024844 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-8" x="207.544" y="207.728"/>
-<use xlink:href="#glyph-1-30" x="212.585096" y="207.728"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-18" x="218.014735" y="207.728"/>
-<use xlink:href="#glyph-1-25" x="224.759442" y="207.728"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 -39.685 L 270.050906 -47.669375 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 213.003906 L 278.171875 210.511719 L 279.003906 212.484375 L 277.339844 213.835938 "/>
-<g clip-path="url(#clip-11)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-11)" d="M 190.371094 249.855469 L 190.371094 227.179688 L 247.066406 227.179688 L 247.066406 249.855469 Z M 190.371094 249.855469 "/>
-</g>
-<g clip-path="url(#clip-12)">
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(19.999695%, 19.999695%, 19.999695%)" stroke-opacity="1" stroke-miterlimit="10" d="M 181.418094 -85.040469 L 181.418094 -62.364688 L 238.113406 -62.364688 L 238.113406 -85.040469 Z M 181.418094 -85.040469 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-14" x="200.221" y="241.764"/>
-<use xlink:href="#glyph-1-22" x="209.047899" y="241.764"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-31" x="214.97567" y="241.764"/>
-<use xlink:href="#glyph-1-14" x="222.118883" y="241.764"/>
-<use xlink:href="#glyph-1-22" x="230.945782" y="241.764"/>
-</g>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 -73.700625 L 271.492313 33.389219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 129.382812 L 278.425781 132.144531 L 280.445312 131.425781 L 281.699219 133.164062 "/>
-<path fill="none" stroke-width="0.49814" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(0%, 0%, 0%)" stroke-opacity="1" stroke-miterlimit="10" d="M 238.113406 -73.700625 L 271.156375 -8.993594 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-<path fill-rule="nonzero" fill="rgb(0%, 0%, 0%)" fill-opacity="1" d="M 281.082031 171.902344 L 277.996094 174.175781 L 280.109375 173.808594 L 281.050781 175.734375 "/>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-18" x="283.578" y="13.023"/>
-<use xlink:href="#glyph-0-5" x="292.257453" y="13.023"/>
-<use xlink:href="#glyph-0-11" x="298.00789" y="13.023"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-16" x="302.156334" y="13.023"/>
-<use xlink:href="#glyph-0-19" x="308.492574" y="13.023"/>
-<use xlink:href="#glyph-0-5" x="317.124207" y="13.023"/>
-<use xlink:href="#glyph-0-11" x="322.874644" y="13.023"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-0-15" x="327.023088" y="13.023"/>
-</g>
-<g clip-path="url(#clip-13)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-12)" d="M 281.082031 68.4375 L 281.082031 45.757812 L 349.113281 45.757812 L 349.113281 68.4375 Z M 281.082031 68.4375 "/>
-</g>
-<g clip-path="url(#clip-14)">
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(17.64679%, 47.059631%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 272.129031 96.3775 L 272.129031 119.057187 L 340.160281 119.057187 L 340.160281 96.3775 Z M 272.129031 96.3775 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-18" x="305.247" y="60.327"/>
-<use xlink:href="#glyph-1-5" x="311.991707" y="60.327"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-28" x="318.048992" y="60.327"/>
-</g>
-<g clip-path="url(#clip-15)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-13)" d="M 281.082031 139.304688 L 281.082031 116.625 L 349.113281 116.625 L 349.113281 139.304688 Z M 281.082031 139.304688 "/>
-</g>
-<g clip-path="url(#clip-16)">
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(17.64679%, 47.059631%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 272.129031 25.510312 L 272.129031 48.19 L 340.160281 48.19 L 340.160281 25.510312 Z M 272.129031 25.510312 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-10" x="287.364" y="131.193"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-23" x="294.756279" y="131.193"/>
-<use xlink:href="#glyph-1-26" x="301.042705" y="131.193"/>
-<use xlink:href="#glyph-1-19" x="303.971721" y="131.193"/>
-<use xlink:href="#glyph-1-26" x="311.045195" y="131.193"/>
-<use xlink:href="#glyph-1-22" x="313.974211" y="131.193"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-31" x="322.731372" y="131.193"/>
-<use xlink:href="#glyph-1-5" x="329.874585" y="131.193"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-28" x="335.93187" y="131.193"/>
-</g>
-<g clip-path="url(#clip-17)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-14)" d="M 281.082031 181.824219 L 281.082031 159.144531 L 349.113281 159.144531 L 349.113281 181.824219 Z M 281.082031 181.824219 "/>
-</g>
-<g clip-path="url(#clip-18)">
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(17.64679%, 47.059631%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 272.129031 -17.009219 L 272.129031 5.670469 L 340.160281 5.670469 L 340.160281 -17.009219 Z M 272.129031 -17.009219 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-22" x="292.719" y="173.733"/>
-<use xlink:href="#glyph-1-14" x="298.985501" y="173.733"/>
-<use xlink:href="#glyph-1-19" x="307.8124" y="173.733"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-31" x="317.376534" y="173.733"/>
-<use xlink:href="#glyph-1-5" x="324.519747" y="173.733"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-28" x="330.577032" y="173.733"/>
-</g>
-<g clip-path="url(#clip-19)">
-<path fill-rule="nonzero" fill="url(#linear-pattern-15)" d="M 281.082031 224.34375 L 281.082031 201.664062 L 349.113281 201.664062 L 349.113281 224.34375 Z M 281.082031 224.34375 "/>
-</g>
-<g clip-path="url(#clip-20)">
-<path fill="none" stroke-width="0.3985" stroke-linecap="butt" stroke-linejoin="miter" stroke="rgb(17.64679%, 47.059631%, 17.64679%)" stroke-opacity="1" stroke-miterlimit="10" d="M 272.129031 -59.52875 L 272.129031 -36.849063 L 340.160281 -36.849063 L 340.160281 -59.52875 Z M 272.129031 -59.52875 " transform="matrix(1, 0, 0, -1, 8.953, 164.815)"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-26" x="294.447" y="216.432"/>
-<use xlink:href="#glyph-1-32" x="297.376016" y="216.432"/>
-<use xlink:href="#glyph-1-2" x="302.745879" y="216.432"/>
-<use xlink:href="#glyph-1-3" x="306.133177" y="216.432"/>
-<use xlink:href="#glyph-1-4" x="310.646253" y="216.432"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-31" x="315.647498" y="216.432"/>
-<use xlink:href="#glyph-1-5" x="322.790711" y="216.432"/>
-</g>
-<g fill="rgb(0%, 0%, 0%)" fill-opacity="1">
-<use xlink:href="#glyph-1-28" x="328.847996" y="216.432"/>
-</g>
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   version="1.1"
+   id="svg1"
+   width="450.62799"
+   height="333.42267"
+   viewBox="0 0 450.62799 333.42267"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg">
+  <defs
+     id="defs1">
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient5">
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0"
+         id="stop1" />
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0.25"
+         id="stop2" />
+      <stop
+         style="stop-opacity:1;stop-color:#f1d2d2"
+         offset="0.5"
+         id="stop3" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="0.75"
+         id="stop4" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="1"
+         id="stop5" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath6">
+      <path
+         d="M 0,68.03218 V 90.7097 H 45.35506 V 68.03218 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path6" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient14">
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0"
+         id="stop10" />
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0.25"
+         id="stop11" />
+      <stop
+         style="stop-opacity:1;stop-color:#f1d2d2"
+         offset="0.5"
+         id="stop12" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="0.75"
+         id="stop13" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="1"
+         id="stop14" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath15">
+      <path
+         d="M 0,34.01608 V 56.6936 H 45.35506 V 34.01608 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path15" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient23">
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0"
+         id="stop19" />
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0.25"
+         id="stop20" />
+      <stop
+         style="stop-opacity:1;stop-color:#f1d2d2"
+         offset="0.5"
+         id="stop21" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="0.75"
+         id="stop22" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="1"
+         id="stop23" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath24">
+      <path
+         d="M 0,0 V 22.67752 H 45.35506 V 0 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path24" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient32">
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0"
+         id="stop28" />
+      <stop
+         style="stop-opacity:1;stop-color:#e9b7b7"
+         offset="0.25"
+         id="stop29" />
+      <stop
+         style="stop-opacity:1;stop-color:#f1d2d2"
+         offset="0.5"
+         id="stop30" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="0.75"
+         id="stop31" />
+      <stop
+         style="stop-opacity:1;stop-color:#faeded"
+         offset="1"
+         id="stop32" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath33">
+      <path
+         d="m 0,-34.01648 v 22.67752 h 45.35506 v -22.67752 z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path33" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient42">
+      <stop
+         style="stop-opacity:1;stop-color:#c1d5f5"
+         offset="0"
+         id="stop38" />
+      <stop
+         style="stop-opacity:1;stop-color:#c1d5f5"
+         offset="0.25"
+         id="stop39" />
+      <stop
+         style="stop-opacity:1;stop-color:#d8e5f9"
+         offset="0.5"
+         id="stop40" />
+      <stop
+         style="stop-opacity:1;stop-color:#f0f5fd"
+         offset="0.75"
+         id="stop41" />
+      <stop
+         style="stop-opacity:1;stop-color:#f0f5fd"
+         offset="1"
+         id="stop42" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath43">
+      <path
+         d="m 90.7097,17.00783 v 22.67752 h 45.35506 V 17.00783 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path43" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient64">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop60" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop61" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop62" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop63" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop64" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath65">
+      <path
+         d="m 181.4194,119.0565 v 22.67752 h 56.69362 V 119.0565 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path65" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient73">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop69" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop70" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop71" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop72" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop73" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath74">
+      <path
+         d="m 181.4194,85.04042 v 22.67752 h 56.69362 V 85.04042 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path74" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient82">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop78" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop79" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop80" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop81" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop82" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath83">
+      <path
+         d="m 181.4194,51.02391 v 22.67752 h 56.69362 V 51.02391 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path83" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient91">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop87" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop88" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop89" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop90" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop91" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath92">
+      <path
+         d="m 181.4194,17.00783 v 22.67752 h 56.69362 V 17.00783 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path92" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient100">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop96" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop97" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop98" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop99" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop100" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath101">
+      <path
+         d="M 181.4194,-17.00824 V 5.66927 h 56.69362 v -22.67751 z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path101" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient109">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop105" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop106" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop107" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop108" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop109" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath110">
+      <path
+         d="m 181.4194,-51.02432 v 22.67752 h 56.69362 v -22.67752 z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path110" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient118">
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0"
+         id="stop114" />
+      <stop
+         style="stop-opacity:1;stop-color:#cccccc"
+         offset="0.25"
+         id="stop115" />
+      <stop
+         style="stop-opacity:1;stop-color:#dfdfdf"
+         offset="0.5"
+         id="stop116" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="0.75"
+         id="stop117" />
+      <stop
+         style="stop-opacity:1;stop-color:#f2f2f2"
+         offset="1"
+         id="stop118" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath119">
+      <path
+         d="m 181.4194,-85.0404 v 22.67752 h 56.69362 V -85.0404 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path119" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient130">
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0"
+         id="stop126" />
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0.25"
+         id="stop127" />
+      <stop
+         style="stop-opacity:1;stop-color:#b7e9b7"
+         offset="0.5"
+         id="stop128" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="0.75"
+         id="stop129" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="1"
+         id="stop130" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath131">
+      <path
+         d="m 272.12953,85.04042 v 22.67752 h 62.36289 V 85.04042 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path131" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient137">
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0"
+         id="stop133" />
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0.25"
+         id="stop134" />
+      <stop
+         style="stop-opacity:1;stop-color:#b7e9b7"
+         offset="0.5"
+         id="stop135" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="0.75"
+         id="stop136" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="1"
+         id="stop137" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath138">
+      <path
+         d="m 272.12953,17.00783 v 22.67752 h 62.36289 V 17.00783 Z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path138" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient144">
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0"
+         id="stop140" />
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0.25"
+         id="stop141" />
+      <stop
+         style="stop-opacity:1;stop-color:#b7e9b7"
+         offset="0.5"
+         id="stop142" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="0.75"
+         id="stop143" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="1"
+         id="stop144" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath145">
+      <path
+         d="M 272.12953,-17.00824 V 5.66927 h 62.36289 v -22.67751 z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path145" />
+    </clipPath>
+    <linearGradient
+       x1="0"
+       y1="0"
+       x2="0"
+       y2="100.00128"
+       gradientUnits="userSpaceOnUse"
+       id="linearGradient151">
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0"
+         id="stop147" />
+      <stop
+         style="stop-opacity:1;stop-color:#93de93"
+         offset="0.25"
+         id="stop148" />
+      <stop
+         style="stop-opacity:1;stop-color:#b7e9b7"
+         offset="0.5"
+         id="stop149" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="0.75"
+         id="stop150" />
+      <stop
+         style="stop-opacity:1;stop-color:#dbf4db"
+         offset="1"
+         id="stop151" />
+    </linearGradient>
+    <clipPath
+       clipPathUnits="userSpaceOnUse"
+       id="clipPath152">
+      <path
+         d="m 272.12953,-51.02432 v 22.67752 h 62.36289 v -22.67752 z"
+         transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)"
+         id="path152" />
+    </clipPath>
+  </defs>
+  <g
+     id="g1">
+    <path
+       id="path1"
+       d="m 4.8645469,-3.2928672 c -0.3984375,0.023437 -0.890625,0.035156 -1.2421875,0.035156 -0.3515625,0 -0.9960938,-0.011719 -1.2539063,-0.046875 l 1.265625,-2.9179688 h 0.023437 c 0.4921875,1.1132813 0.890625,2.0976563 1.2070318,2.9296878 z m -2.71875,0.5625 c 0.2929687,-0.023437 1.1132812,-0.035156 1.5351562,-0.035156 0.4570313,0 1.1132813,0.011719 1.40625,0.035156 0.5742188,1.5703125 0.84375,2.50781251 0.9257813,2.76562501 C 6.2122031,1.015625e-4 6.4114219,1.015625e-4 6.6106406,1.015625e-4 c 0.1992188,0 0.515625,0 0.7148438,0.0351562475 C 6.7278281,-1.1717734 5.1575156,-5.2498984 4.0559531,-7.8631797 H 3.7161094 C 2.5676719,-5.2147422 1.3957969,-2.6014609 0.18876563,0.03525781 0.34110938,1.015625e-4 0.48173438,1.015625e-4 0.61064063,1.015625e-4 c 0.12890625,0 0.39843747,0 0.53906247,0.0351562475 0.1875,-0.7265625 0.5625,-1.71093751 0.9960938,-2.76562501 z m 7.3066406,-0.7851562 c 0.2695313,-0.6328125 0.9726565,-1.1835938 1.4414065,-1.1835938 0.808594,0 1.230469,0.7382813 1.230469,2.0976563 0,0.9726562 -0.28125,2.23828121 -1.558594,2.23828121 -0.1875,0 -0.6914065,-0.046875 -1.1132815,-0.5390625 z m 0,-0.6914063 v -0.8789062 c 0,-0.082031 -0.023437,-0.1171875 -0.070312,-0.1171875 -0.2695312,0.058594 -0.703125,0.070312 -0.9609375,0.035156 l -0.035156,0.035156 c 0.09375,0.5273437 0.1171875,1.4296875 0.1171875,2.3203125 v 3.19921871 c 0,0.89062499 -0.023437,1.75781249 -0.1171875,2.39062499 l 0.035156,0.035156 c 0.140625,-0.023437 0.4101563,-0.035156 0.5507813,-0.035156 0.1523437,0 0.421875,0.011719 0.5625,0.035156 l 0.023437,-0.035156 C 9.4641563,2.0977578 9.4524375,1.2891641 9.4524375,0.38682031 v -0.52734375 c 0.3164063,0.1640625 0.7617185,0.2578125 1.1835935,0.2578125 1.582032,0 2.53125,-1.19531246 2.53125,-2.88281246 0,-1.265625 -0.738281,-2.484375 -2.074218,-2.484375 -0.46875,0 -1.066407,0.2460937 -1.617188,1.078125 z m 6.2548825,0.6914063 c 0.269532,-0.6328125 0.972657,-1.1835938 1.441407,-1.1835938 0.808593,0 1.230468,0.7382813 1.230468,2.0976563 0,0.9726562 -0.28125,2.23828121 -1.558593,2.23828121 -0.1875,0 -0.691407,-0.046875 -1.113282,-0.5390625 z m 0,-0.6914063 v -0.8789062 c 0,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269531,0.058594 -0.703125,0.070312 -0.960938,0.035156 l -0.03516,0.035156 c 0.09375,0.5273437 0.117188,1.4296875 0.117188,2.3203125 v 3.19921871 c 0,0.89062499 -0.02344,1.75781249 -0.117188,2.39062499 l 0.03516,0.035156 c 0.140625,-0.023437 0.410157,-0.035156 0.550782,-0.035156 0.152343,0 0.421875,0.011719 0.5625,0.035156 l 0.02344,-0.035156 C 15.719039,2.0977578 15.70732,1.2891641 15.70732,0.38682031 v -0.52734375 c 0.316407,0.1640625 0.761719,0.2578125 1.183594,0.2578125 1.582031,0 2.53125,-1.19531246 2.53125,-2.88281246 0,-1.265625 -0.738281,-2.484375 -2.074219,-2.484375 -0.46875,0 -1.066406,0.2460937 -1.617187,1.078125 z m 5.396485,1.8164063 c 0,0.890625 -0.02344,1.75781246 -0.105469,2.3906249625 l 0.02344,0.0351562475 c 0.140625,-0.0234375 0.410157,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.410157,0.0117187475 0.5625,0.0351562475 L 22.170211,1.015625e-4 C 22.064742,-0.67958594 22.053023,-1.4764609 22.053023,-2.3905234 v -3.5859375 c 0,-0.9023438 0.04687,-1.4765625 0.117188,-2.25 0,-0.082031 -0.03516,-0.1171875 -0.117188,-0.1171875 -0.292968,0.1171875 -0.5625,0.1992187 -1.03125,0.234375 l -0.02344,0.035156 c 0.08203,0.515625 0.105469,1.40625 0.105469,2.3085938 z m 2.894531,-4.7460938 c 0,0.328125 0.28125,0.609375 0.609375,0.609375 0.316406,0 0.609375,-0.28125 0.609375,-0.609375 0,-0.3164062 -0.292969,-0.609375 -0.609375,-0.609375 -0.328125,0 -0.609375,0.2929688 -0.609375,0.609375 z m 0.117187,4.3242188 v 0.65625 c 0,0.9023437 -0.02344,1.51171871 -0.105468,2.1562499625 l 0.02344,0.0351562475 c 0.140625,-0.0234375 0.410156,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.410156,0.0117187475 0.5625,0.0351562475 L 25.18193,1.015625e-4 C 25.076461,-0.66786719 25.064742,-1.2420859 25.064742,-2.1561484 v -0.84375 c 0,-0.9023438 0.02344,-1.3242188 0.117188,-2.0507813 0,-0.1171875 -0.02344,-0.1289062 -0.117188,-0.1289062 -0.269531,0.035156 -0.785156,0.035156 -1.03125,0.011719 l -0.02344,0.035156 c 0.07031,0.515625 0.105468,1.4296875 0.105468,2.3203125 z m 5.018555,-1.9570313 c 0.457031,0 0.867188,0.3515625 1.195313,0.8789063 l 0.152343,-0.011719 0.28125,-0.9726562 -0.02344,-0.035156 c -0.375,-0.1992187 -0.996094,-0.3398437 -1.59375,-0.3398437 -1.242188,0 -2.542969,1.0195312 -2.542969,2.6484375 0,1.65234371 0.9375,2.71874996 2.402344,2.71874996 0.714844,0 1.277344,-0.234375 1.757812,-0.84375 l -0.234375,-0.2578125 h -0.04687 c -0.445312,0.41015625 -0.832031,0.50390625 -1.277343,0.50390625 -0.867188,0 -1.546875,-0.78515621 -1.546875,-2.17968751 0,-1.3242187 0.691406,-2.109375 1.476562,-2.109375 z m 5.844727,2.0742188 -0.04687,1.4296875 c 0,0.1523437 -0.07031,0.234375 -0.164063,0.30468746 -0.339844,0.2578125 -0.738281,0.46875 -1.101562,0.46875 -0.539063,0 -0.878907,-0.3515625 -0.878907,-0.72656246 0,-0.5390625 0.246094,-0.9492188 1.183594,-1.1953125 z m 0,2.12109371 c 0.128906,0.46875 0.492187,0.69140625 0.9375,0.69140625 0.292968,0 0.667968,-0.0703125 0.9375,-0.36328125 l -0.08203,-0.29296875 c -0.128906,0.0351563 -0.234375,0.0585938 -0.316406,0.0585938 -0.128906,0 -0.28125,-0.0234375 -0.363281,-0.09375 -0.117188,-0.10546875 -0.1875,-0.3984375 -0.1875,-0.93749996 0,-0.3398438 0.03516,-1.6640626 0.03516,-1.8046876 0,-1.6054687 -1.066406,-1.9335937 -1.96875,-1.9335937 -0.914062,0 -1.464844,0.4453125 -1.757812,0.6914063 l -0.03516,0.046875 0.199219,0.8085937 0.152344,0.011719 c 0.339844,-0.5390624 0.738281,-1.0195312 1.3125,-1.0195312 0.433594,0 1.148437,0.058594 1.148437,1.40625 0,0.09375 -0.04687,0.140625 -0.08203,0.1523437 l -1.113281,0.2460938 c -1.21875,0.28125 -1.980469,0.9257812 -1.980469,1.734375 0,0.89062501 0.609375,1.28906251 1.488281,1.28906251 0.667969,0 0.996094,-0.15234375 1.628907,-0.69140625 z m 3.18457,-4.55859371 c -0.257812,0 -0.503906,0 -0.644531,-0.023437 -0.08203,0.2109375 -0.152344,0.3164062 -0.269531,0.4921875 l 0.05859,0.082031 c 0.210938,-0.011719 0.574219,-0.011719 0.855469,-0.023437 v 1.640625 c 0,0.7148438 -0.03516,1.5820313 -0.03516,1.9335938 0,0.78515621 0.503906,1.14843746 1.054687,1.14843746 0.492188,0 0.878907,-0.1171874975 1.382813,-0.4453125 l -0.152344,-0.28125 c -0.363281,0.1171875 -0.632812,0.12890625 -0.9375,0.0820313 -0.28125,-0.0351563 -0.398437,-0.31640625 -0.398437,-0.98437506 0,-0.3515625 0.04687,-0.8671875 0.04687,-1.5820312 v -1.5117188 h 0.445312 c 0.292969,0 0.691406,0.011719 0.867188,0.023437 0.03516,-0.1875 0.07031,-0.3046875 0.140625,-0.4804687 l -0.05859,-0.09375 c -0.222656,0.011719 -0.5625,0.023437 -0.84375,0.023437 h -0.550781 c 0,-0.8789063 0,-1.0429688 0.05859,-1.7460938 0,-0.082031 -0.03516,-0.1171875 -0.117187,-0.1171875 -0.292969,0.1171875 -0.445313,0.2578125 -0.832031,0.3046875 l -0.02344,0.035156 c -0.02344,0.421875 -0.04687,0.8320312 -0.04687,1.5234375 z m 3.887695,-2.0039063 c 0,0.328125 0.28125,0.609375 0.609375,0.609375 0.316407,0 0.609375,-0.28125 0.609375,-0.609375 0,-0.3164062 -0.292968,-0.609375 -0.609375,-0.609375 -0.328125,0 -0.609375,0.2929688 -0.609375,0.609375 z m 0.117188,4.3242188 v 0.65625 c 0,0.9023437 -0.02344,1.51171871 -0.105469,2.1562499625 l 0.02344,0.0351562475 c 0.140625,-0.0234375 0.410156,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.410156,0.0117187475 0.5625,0.0351562475 L 43.234664,1.015625e-4 C 43.129195,-0.66786719 43.117477,-1.2420859 43.117477,-2.1561484 v -0.84375 c 0,-0.9023438 0.02344,-1.3242188 0.117187,-2.0507813 0,-0.1171875 -0.02344,-0.1289062 -0.117187,-0.1289062 -0.269532,0.035156 -0.785157,0.035156 -1.03125,0.011719 l -0.02344,0.035156 c 0.07031,0.515625 0.105469,1.4296875 0.105469,2.3203125 z m 2.487305,0.3632812 c 0,1.4296875 0.9375,2.56640626 2.53125,2.56640626 1.582031,0 2.53125,-1.11328125 2.53125,-2.67187496 0,-1.6289063 -0.867188,-2.6953125 -2.484375,-2.6953125 -1.570313,0 -2.578125,1.1015625 -2.578125,2.8007812 z m 2.496093,-2.3203125 c 1.265625,0 1.511719,0.984375 1.511719,2.4257813 0,1.1367187 -0.5625,1.98046871 -1.371094,1.98046871 -1.265625,0 -1.582031,-1.37109371 -1.582031,-2.28515621 0,-1.0429688 0.316406,-2.1210938 1.441406,-2.1210938 z m 8.519532,2.6132813 c 0,-0.375 0.02344,-0.8203125 0.02344,-1.2070313 0,-1.2421875 -0.398437,-1.8867187 -1.476562,-1.8867187 -0.457032,0 -1.300782,0.1875 -2.074219,1.078125 l -0.02344,-0.035156 v -0.8789062 c -0.01172,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269531,0.058594 -0.703125,0.070312 -0.960938,0.035156 l -0.03516,0.035156 c 0.09375,0.5273437 0.117188,1.4296875 0.117188,2.3203125 v 0.65625 c 0,0.9023437 -0.01172,1.51171871 -0.117188,2.1562499625 l 0.03516,0.0351562475 c 0.140625,-0.0234375 0.410157,-0.0351562475 0.550782,-0.0351562475 0.152343,0 0.421875,0.0117187475 0.5625,0.0351562475 L 52.225875,1.015625e-4 C 52.132125,-0.67958594 52.120406,-1.2420859 52.120406,-2.1561484 v -1.3945313 c 0.609375,-0.7148437 1.300782,-0.984375 1.734375,-0.984375 0.609375,0 0.867188,0.2460938 0.867188,1.2070313 v 1.171875 c 0,0.9023437 -0.02344,1.52343746 -0.117188,2.1562499625 l 0.02344,0.0351562475 c 0.152344,-0.0234375 0.421875,-0.0351562475 0.5625,-0.0351562475 0.152344,0 0.421875,0.0117187475 0.5625,0.0351562475 L 55.776656,1.015625e-4 C 55.682906,-0.67958594 55.671188,-1.2420859 55.671188,-2.1561484 Z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="Application"
+       transform="matrix(1.3333333,0,0,1.3333333,4.428,15.605333)" />
+    <g
+       id="g5"
+       clip-path="url(#clipPath6)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.2096133,0,0,-0.6048,-25.865307,144.18276)"
+         style="fill:url(#linearGradient5);stroke:none"
+         id="path5" />
+    </g>
+    <path
+       id="path7"
+       d="M 0,68.03218 V 90.7097 H 45.35506 V 68.03218 Z"
+       style="fill:none;stroke:#782d2d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path8"
+       d="m 1.897875,-5.6257266 c 0,-0.2929687 0.1757812,-0.4453125 0.890625,-0.4453125 0.6914062,0 1.4179687,0.1875 1.4179687,1.3828125 0,1.1367188 -0.5625,1.5117188 -1.5234375,1.5117188 -0.2578125,0 -0.65625,-0.035156 -0.7851562,-0.1054688 z m -0.8671875,1.1953125 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 0.960375,0.03442969 c 0.1171875,-0.0234375 0.3867187,-0.0351562525 0.5039062,-0.0351562525 0.1171875,0 0.3867188,0.0117187525 0.5039063,0.0351562525 L 1.991625,-7.265625e-4 C 1.9095937,-0.56322656 1.897875,-1.2429141 1.897875,-1.9929141 v -0.8554687 c 0.2109375,0.070312 0.4804687,0.09375 0.8320312,0.09375 1.8164063,0 2.4375,-1.1367188 2.4375,-1.9921875 0,-0.7382813 -0.46875,-1.7460938 -2.3085937,-1.7460938 -0.2578125,0 -1.1015625,0.070312 -1.3945313,0.070312 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.0234375,0.035156 c 0.0820312,0.5625 0.09375,1.2421875 0.09375,1.992188 z m 6.4365234,2.4375 v -1.1601562 c 0.46875,0 1.5234375,0.023437 2.2148438,0.09375 l 0.035156,-0.035156 c -0.023437,-0.082031 -0.035156,-0.2226562 -0.035156,-0.3164062 0,-0.082031 0.011719,-0.2226563 0.035156,-0.3164063 l -0.035156,-0.023437 c -0.5859375,0.046875 -1.0429688,0.09375 -2.2148438,0.09375 v -0.7734375 c 0,-0.1757812 0,-1.1953125 0.070312,-1.4414062 1.3359375,0 2.7187501,0.1289062 2.7187501,0.1289062 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 c -0.164062,0.023437 -0.3984376,0.035156 -0.5976563,0.035156 h -2.625 c -0.3398438,0 -0.5039063,-0.035156 -0.5039063,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 6.5179922,0.03442969 c 0,0 0.1640625,-0.0351562525 0.515625,-0.0351562525 h 2.71875 c 0.2109375,0 0.4335938,0.0117187525 0.5976558,0.0351562525 l 0.02344,-0.0351562525 C 10.361742,-0.14135156 10.350023,-0.21166406 10.350023,-0.30541406 c 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.4765621,0.1171875 -2.8124996,0.1171875 -0.070312,-0.234375 -0.070312,-1.26562504 -0.070312,-1.42968754 z m 5.7246091,-2.4375 v 2.4375 c 0,0.75 -0.02344,1.45312504 -0.105468,1.9921875375 l 0.02344,0.0351562525 c 0.117188,-0.0234375 0.386719,-0.0351562525 0.515625,-0.0351562525 0.117188,0 0.386719,0.0117187525 0.503906,0.0351562525 l 0.02344,-0.0351562525 C 14.070727,-0.56322656 14.047289,-1.2429141 14.047289,-1.9929141 v -2.4375 c 0,-0.1757812 0.01172,-1.1953125 0.08203,-1.4414062 1.335938,0 2.015625,0.140625 2.015625,0.140625 l 0.01172,-0.035156 c -0.01172,-0.1523437 -0.01172,-0.375 0,-0.65625 l -0.01172,-0.035156 c -0.164062,0.023437 -0.398437,0.035156 -0.597656,0.035156 H 11.69182 c -0.199218,0 -0.433593,-0.011719 -0.597656,-0.035156 l -0.01172,0.035156 c 0.01172,0.28125 0.01172,0.5039063 0,0.65625 l 0.01172,0.035156 c 0,0 0.679688,-0.140625 2.015625,-0.140625 0.07031,0.2460937 0.08203,1.265625 0.08203,1.4414062 z m 5.53125,4.08984379 c -0.5625,0 -1.277343,-0.4921875 -1.582031,-1.07812499 l -0.105469,0.011719 c -0.03516,0.3632813 -0.152343,0.72656254 -0.222656,1.03125004 l 0.01172,0.0234375 c 0,0 0.644531,0.45703125 1.816406,0.45703125 1.21875,0 2.167969,-0.75 2.167969,-1.91015629 0,-1.1484375 -0.972656,-1.6875 -1.78125,-1.9921875 -0.503906,-0.1875 -1.21875,-0.4804687 -1.21875,-1.3007812 0,-0.3632813 0.199219,-0.7617188 0.457031,-0.9023438 0.175781,-0.082031 0.386719,-0.1171875 0.609375,-0.1171875 0.550781,0 1.089844,0.4335938 1.359375,1.0429688 l 0.09375,-0.011719 c 0.04687,-0.3515625 0.140625,-0.6914062 0.234375,-0.9960937 l -0.02344,-0.023437 c 0,0 -0.410157,-0.4453125 -1.582032,-0.4453125 -0.269531,0 -0.5625,0.046875 -0.855468,0.1640625 -0.585938,0.2460937 -1.089844,0.8320312 -1.089844,1.5585937 0,1.0429688 0.878906,1.5351563 1.710937,1.875 0.644532,0.2695313 1.160157,0.5976563 1.160157,1.453125 0,0.73828129 -0.574219,1.16015629 -1.160157,1.16015629 z m 5.124024,-3.63281249 c 0.375,0 0.726562,0.28125 0.996093,0.7382812 l 0.128907,-0.011719 0.222656,-0.8203125 -0.02344,-0.023437 c -0.304687,-0.1640625 -0.820312,-0.28125 -1.324218,-0.28125 -1.03125,0 -2.121094,0.84375 -2.121094,2.203125 0,1.38281249 0.785156,2.27343749 2.003906,2.27343749 0.597656,0 1.066406,-0.2109375 1.464844,-0.71484375 l -0.1875,-0.2109375 h -0.03516 c -0.386719,0.33984375 -0.703125,0.421875 -1.066407,0.421875 -0.726562,0 -1.300781,-0.65625004 -1.300781,-1.82812504 0,-1.0898437 0.574219,-1.7460937 1.242188,-1.7460937 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="PETSc"
+       transform="matrix(1.3333333,0,0,1.3333333,17.532,118.208)" />
+    <path
+       id="path9"
+       d="M 45.35506,79.37073 89.19551,35.53065"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path10"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.9428,0.94278667,0.94278667,-0.9428,123.306,172.39647)" />
+    <g
+       id="g14"
+       clip-path="url(#clipPath15)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.2096133,0,0,-0.6048,-25.865307,189.53756)"
+         style="fill:url(#linearGradient14);stroke:none"
+         id="path14" />
+    </g>
+    <path
+       id="path16"
+       d="M 0,34.01608 V 56.6936 H 45.35506 V 34.01608 Z"
+       style="fill:none;stroke:#782d2d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path17"
+       d="m 1.8995078,-2.8475469 c 0.3398438,0 0.6445313,-0.011719 0.9140625,-0.035156 0.28125,0.328125 0.4921875,0.7265625 0.75,1.125 0.3632813,0.5859375 0.84375,1.45312497 0.9492188,1.79296872 0.1875,-0.035156245 0.4101562,-0.035156245 0.5976562,-0.035156245 0.1992188,0 0.3984375,0 0.5976563,0.035156245 L 5.7315391,1.09375e-4 C 5.2276328,-0.56239063 4.2198203,-2.1327031 3.5987266,-2.9764531 c 0.2695312,-0.046875 0.46875,-0.1171875 0.609375,-0.1875 0.5390625,-0.2695313 0.9960937,-0.7851563 0.9960937,-1.6054688 0,-0.5039062 -0.1757812,-0.9140625 -0.5273437,-1.2421875 -0.4804688,-0.4570312 -1.2421875,-0.4804687 -1.8632813,-0.4804687 -0.2578125,0 -1.0546875,0.070312 -1.3476562,0.070312 -0.1171875,0 -0.3867188,-0.011719 -0.50390629,-0.035156 l -0.0234375,0.035156 c 0.0820313,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312497 -0.09375,1.992187475 L 0.96200781,0.03526562 C 1.0791953,0.01182812 1.3487266,1.09375e-4 1.4659141,1.09375e-4 c 0.1171875,0 0.3867187,0.011718745 0.5039062,0.035156245 L 1.9932578,1.09375e-4 C 1.9112266,-0.56239063 1.8995078,-1.2420781 1.8995078,-1.9920781 Z m 0.9257813,-3.2226562 c 0.5976562,0 1.4179687,0.234375 1.4179687,1.3125 0,1.2070312 -0.8085937,1.5117187 -1.7695312,1.5117187 H 1.8995078 v -2.3789062 c 0,-0.3867188 0.023437,-0.4453125 0.9257813,-0.4453125 z m 6.2460937,3.8320312 -0.046875,1.1835938 c 0,0.12890622 -0.058594,0.19921872 -0.140625,0.25781247 -0.28125,0.2109375 -0.609375,0.38671875 -0.9140625,0.38671875 -0.4453125,0 -0.7382812,-0.29296875 -0.7382812,-0.60937502 0,-0.4453125 0.2109375,-0.7851562 0.984375,-0.9960937 z m 0,1.75781252 c 0.1054688,0.38671875 0.3984375,0.5859375 0.7734375,0.5859375 0.2460937,0 0.5507817,-0.0703125 0.7851567,-0.31640625 l -0.07031,-0.234375 c -0.105469,0.0234375 -0.199219,0.046875 -0.269531,0.046875 -0.09375,0 -0.222656,-0.0234375 -0.2929689,-0.0820313 -0.1054688,-0.0820313 -0.1640625,-0.328125 -0.1640625,-0.77343757 0,-0.2929687 0.035156,-1.3945312 0.035156,-1.5 0,-1.3476562 -0.890625,-1.6171875 -1.6523437,-1.6171875 -0.75,0 -1.2070313,0.3632813 -1.4648438,0.5742188 l -0.023437,0.046875 0.1640625,0.65625 0.1289063,0.011719 c 0.28125,-0.4453125 0.6210937,-0.84375 1.1015625,-0.84375 0.3515625,0 0.9492187,0.046875 0.9492187,1.171875 0,0.070312 -0.035156,0.1171875 -0.070312,0.1289063 l -0.9257812,0.1992187 c -1.0078125,0.234375 -1.6523438,0.7734375 -1.6523438,1.45312507 0,0.73828125 0.515625,1.078125 1.2539063,1.078125 0.5390625,0 0.8203125,-0.140625 1.3476562,-0.5859375 z M 11.71982,-4.2772344 c -0.210937,0 -0.421875,0 -0.539062,-0.011719 -0.07031,0.1640625 -0.128906,0.2578125 -0.222656,0.3984375 l 0.04687,0.070312 c 0.175781,0 0.480468,0 0.714843,-0.011719 v 1.359375 c 0,0.5976563 -0.03516,1.3125 -0.03516,1.61718752 0,0.64453125 0.421875,0.9609375 0.878906,0.9609375 0.421875,0 0.738282,-0.105468745 1.160157,-0.375 l -0.128907,-0.234375 c -0.304687,0.09375 -0.527343,0.10546875 -0.785156,0.0703125 -0.234375,-0.0351563 -0.328125,-0.26953125 -0.328125,-0.82031252 0,-0.3046875 0.03516,-0.7265625 0.03516,-1.3242187 v -1.2539063 h 0.375 c 0.234375,0 0.574219,0.011719 0.714844,0.011719 0.03516,-0.1523438 0.05859,-0.2578125 0.117188,-0.3984375 l -0.04687,-0.070312 c -0.175782,0 -0.46875,0.011719 -0.691407,0.011719 h -0.46875 c 0,-0.7382812 0,-0.8671875 0.04687,-1.453125 0,-0.070312 -0.03516,-0.09375 -0.09375,-0.09375 -0.246093,0.09375 -0.375,0.2109375 -0.691406,0.2460938 l -0.02344,0.035156 c -0.02344,0.3398438 -0.03516,0.6914063 -0.03516,1.265625 z m 3.679688,1.5585938 c 0.175781,-1.03125 0.820312,-1.2539063 1.136719,-1.2539063 0.386718,0 0.84375,0.3632813 0.84375,1.1132813 0,0.09375 -0.04687,0.140625 -0.140625,0.140625 z m 2.660156,1.6875 c -0.351562,0.37499997 -0.808594,0.53906247 -1.359375,0.53906247 -0.351562,0 -0.820312,-0.12890625 -1.089844,-0.56249997 -0.175781,-0.28125 -0.234375,-0.6796875 -0.234375,-1.265625 h 2.707032 c 0.105468,0 0.175781,-0.058594 0.175781,-0.1757813 0,-0.8320312 -0.410156,-1.875 -1.722656,-1.875 -1.019532,0 -2.039063,0.8203125 -2.039063,2.2851563 0,0.5742187 0.105469,1.12499997 0.445313,1.52343747 0.339843,0.421875 0.914062,0.66796875 1.582031,0.66796875 0.726562,0 1.359375,-0.375 1.734375,-0.890625 z m 1.511719,-0.9609375 c 0,0.75 -0.02344,1.46484372 -0.09375,1.992187475 l 0.02344,0.035156245 c 0.117188,-0.0234375 0.339844,-0.035156245 0.46875,-0.035156245 0.117188,0 0.339844,0.011718745 0.457032,0.035156245 L 20.450289,1.09375e-4 C 20.368258,-0.56239063 20.368258,-1.2303594 20.368258,-1.9920781 v -2.9882813 c 0,-0.75 0.03516,-1.2304687 0.08203,-1.875 0,-0.070312 -0.02344,-0.09375 -0.08203,-0.09375 -0.257813,0.09375 -0.46875,0.1640625 -0.867188,0.1992188 l -0.02344,0.023437 c 0.07031,0.4335937 0.09375,1.171875 0.09375,1.921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="Ratel"
+       transform="matrix(1.3333333,0,0,1.3333333,20.494667,163.88267)" />
+    <path
+       id="path18"
+       d="M 45.35506,45.35464 88.68451,30.46008"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path19"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.2609067,0.43342667,0.43342667,-1.2609067,122.62468,179.1572)" />
+    <g
+       id="g23"
+       clip-path="url(#clipPath24)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.2096133,0,0,-0.6048,-25.865307,234.89232)"
+         style="fill:url(#linearGradient23);stroke:none"
+         id="path23" />
+    </g>
+    <path
+       id="path25"
+       d="M 0,0 V 22.67752 H 45.35506 V 0 Z"
+       style="fill:none;stroke:#782d2d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path26"
+       d="m 5.3445234,-4.4292109 v 0.796875 H 1.8992109 v -0.796875 c 0,-0.75 0.011719,-1.4648438 0.09375,-1.9921875 l -0.011719,-0.035156 c -0.1171875,0.023437 -0.3984375,0.035156 -0.515625,0.035156 -0.1171875,0 -0.3867188,-0.011719 -0.50390626,-0.035156 l -0.0234375,0.035156 c 0.0820313,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312496 -0.09375,1.9921874625 L 0.94999219,0.03563281 C 1.0671797,0.01219531 1.3484297,4.765625e-4 1.4656172,4.765625e-4 c 0.1171875,0 0.3867187,0.0117187475 0.5039062,0.0351562475 L 1.9929609,4.765625e-4 C 1.9109297,-0.56202344 1.8992109,-1.2417109 1.8992109,-1.9917109 v -1.2070313 h 3.4453125 v 1.2070313 c 0,0.75 -0.023437,1.45312496 -0.1054687,1.9921874625 L 5.2507734,0.03563281 C 5.3679609,0.01219531 5.6492109,4.765625e-4 5.7663984,4.765625e-4 c 0.1171875,0 0.3867188,0.0117187475 0.515625,0.0351562475 L 6.2937422,4.765625e-4 C 6.2117109,-0.56202344 6.1999922,-1.2417109 6.1999922,-1.9917109 v -2.4375 c 0,-0.75 0.011719,-1.4648438 0.09375,-1.9921875 l -0.011719,-0.035156 c -0.1171875,0.023437 -0.3984375,0.035156 -0.515625,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.1054687,1.2421875 0.1054684,1.9921875 z m 5.4287106,-1.6875 c 1.40625,0 2.296875,0.9609375 2.296875,3.0234375 0,1.7929687 -0.820312,2.75390621 -1.957031,2.75390621 -1.1953124,0 -2.3320311,-1.00781251 -2.3320311,-2.91796871 0,-2.0859375 1.03125,-2.859375 1.9921871,-2.859375 z m 3.328125,2.8359375 c 0,-1.96875 -1.40625,-3.2695313 -3.199218,-3.2695313 -1.7812504,0 -3.1523441,1.5117188 -3.1523441,3.4570313 0,1.96875 1.4414062,3.19921871 3.1640621,3.19921871 1.898438,0 3.1875,-1.39453121 3.1875,-3.38671871 z m 1.590821,-3.140625 c 0.08203,1.8867187 0.05859,4.8398437 -0.105469,6.4218749625 l 0.02344,0.0351562475 c 0.117188,-0.0234375 0.234375,-0.0351562475 0.351563,-0.0351562475 0.117187,0 0.222656,0.0117187475 0.351562,0.0351562475 L 16.324992,4.765625e-4 C 16.242961,-0.56202344 16.231242,-1.2417109 16.231242,-1.9917109 v -2.3789063 c 0,-0.609375 0.01172,-0.6445312 0.386719,-0.1640625 l 3.480469,4.39453126 c 0.105468,0.15234375 0.257812,0.24609375 0.421875,0.24609375 0.140625,0 0.175781,-0.12890625 0.1875,-0.31640625 0.04687,-2.27343746 0.04687,-4.00781246 0.199218,-6.21093746 l -0.01172,-0.035156 c -0.128907,0.023437 -0.234375,0.035156 -0.351563,0.035156 -0.117187,0 -0.234375,-0.011719 -0.351562,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.105469,1.2421875 0.105469,1.9921875 v 2.6601562 c -0.02344,0.5390625 -0.164063,0.3164063 -0.585938,-0.2578125 l -3.421875,-4.4179687 c 0,0 -0.09375,0.023437 -0.140625,0.023437 -0.339843,0 -0.410156,-0.035156 -0.410156,-0.035156 z m 7.974609,4.4296875 v -1.1601563 c 0.46875,0 1.523438,0.023437 2.214844,0.09375 l 0.03516,-0.035156 c -0.02344,-0.082031 -0.03516,-0.2226563 -0.03516,-0.3164063 0,-0.082031 0.01172,-0.2226562 0.03516,-0.3164062 l -0.03516,-0.023437 c -0.585938,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757813 0,-1.1953125 0.07031,-1.4414063 1.335937,0 2.71875,0.1289063 2.71875,0.1289063 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 c -0.164063,0.023437 -0.398438,0.035156 -0.597657,0.035156 h -2.625 c -0.339843,0 -0.503906,-0.035156 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312496 -0.09375,1.9921874625 L 22.71757,0.03563281 c 0,0 0.164063,-0.0351562475 0.515625,-0.0351562475 h 2.71875 c 0.210938,0 0.433594,0.0117187475 0.597657,0.0351562475 L 26.573039,4.765625e-4 C 26.56132,-0.14014844 26.549602,-0.21046094 26.549602,-0.30421094 c 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.476563,0.1171875 -2.8125,0.1171875 -0.07031,-0.234375 -0.07031,-1.26562496 -0.07031,-1.42968746 z m 5.358398,0 v -1.1601563 c 0.46875,0 1.523438,0.023437 2.214844,0.09375 l 0.03516,-0.035156 c -0.02344,-0.082031 -0.03516,-0.2226563 -0.03516,-0.3164063 0,-0.082031 0.01172,-0.2226562 0.03516,-0.3164062 l -0.03516,-0.023437 c -0.585937,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757813 0,-1.1953125 0.07031,-1.4414063 1.335937,0 2.71875,0.1289063 2.71875,0.1289063 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 c -0.164063,0.023437 -0.398438,0.035156 -0.597656,0.035156 h -2.625 c -0.339844,0 -0.503907,-0.035156 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312496 -0.09375,1.9921874625 l 0.01172,0.0351562475 c 0,0 0.164062,-0.0351562475 0.515625,-0.0351562475 h 2.71875 c 0.210937,0 0.433593,0.0117187475 0.597656,0.0351562475 L 31.931437,4.765625e-4 C 31.919719,-0.14014844 31.908,-0.21046094 31.908,-0.30421094 c 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.476563,0.1171875 -2.8125,0.1171875 -0.07031,-0.234375 -0.07031,-1.26562496 -0.07031,-1.42968746 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="HONEE"
+       transform="matrix(1.3333333,0,0,1.3333333,12.956,208.91733)" />
+    <path
+       id="path27"
+       d="M 45.35506,11.33856 88.68451,26.23311"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path28"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.2609067,-0.43342667,-0.43342667,-1.2609067,122.62468,184.7932)" />
+    <g
+       id="g32"
+       clip-path="url(#clipPath33)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.2096133,0,0,-0.6048,-25.865307,280.24763)"
+         style="fill:url(#linearGradient32);stroke:none"
+         id="path32" />
+    </g>
+    <path
+       id="path34"
+       d="m 0,-34.01648 v 22.67752 h 45.35506 v -22.67752 z"
+       style="fill:none;stroke:#782d2d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path35"
+       d="m 1.9222109,-6.59725 c -0.3515625,2.015625 -0.890625,4.7695312 -1.28906246,6.6328125 0.1171875,-0.03515625 0.2109375,-0.03515625 0.328125,-0.03515625 0.11718746,0 0.19921876,0 0.30468746,0.03515625 0.1523438,-1.0546875 0.4921875,-3.2695313 0.7265625,-4.78125 h 0.046875 c 0.75,1.59375 1.453125,3.1992187 2.0976563,4.74609375 h 0.1875 C 5.0393984,-1.6285 5.7425234,-3.1519375 6.5393984,-4.7105313 l 0.023437,0.011719 c 0.2109375,1.5585937 0.4101563,3.0820312 0.5625,4.734375 C 7.2776797,4.0625e-4 7.4651797,4.0625e-4 7.6058047,4.0625e-4 c 0.1523437,0 0.4101562,0 0.5625,0.03515625 C 7.8050234,-2.09725 7.5120547,-4.1714688 7.2308047,-6.59725 H 6.9847109 L 4.5589297,-1.6519375 H 4.4886172 C 3.6917422,-3.2925625 2.9768984,-4.8863125 2.2503359,-6.59725 Z m 10.9599611,0.1757812 h -2.589844 c -0.3398436,0 -0.5039061,-0.035156 -0.5039061,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312505 -0.09375,1.99218755 l 0.011719,0.03515625 c 0,0 0.1640625,-0.03515625 0.5156251,-0.03515625 0.339844,0 0.503906,0.03515625 0.503906,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218755 -0.09375,-1.99218755 v -1.1601562 c 0.46875,0 1.523437,0.023437 2.214844,0.09375 l 0.03516,-0.035156 C 12.952484,-3.175375 12.940766,-3.316 12.940766,-3.40975 c 0,-0.082031 0.01172,-0.2226563 0.03516,-0.3164063 l -0.03516,-0.023437 c -0.585938,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757812 0,-1.1953125 0.07031,-1.4414062 1.335938,0 2.71875,0.1289062 2.71875,0.1289062 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.2109375 -0.01172,-0.2929687 0,-0.082031 0,-0.1992188 0.01172,-0.3398438 l -0.01172,-0.035156 c -0.164062,0.023437 -0.398437,0.035156 -0.632812,0.035156 z m 2.865234,4.4296875 v -1.1601562 c 0.46875,0 1.523438,0.023437 2.214844,0.09375 l 0.03516,-0.035156 C 17.973969,-3.175375 17.96225,-3.316 17.96225,-3.40975 c 0,-0.082031 0.01172,-0.2226563 0.03516,-0.3164063 l -0.03516,-0.023437 c -0.585938,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757812 0,-1.1953125 0.07031,-1.4414062 1.335937,0 2.71875,0.1289062 2.71875,0.1289062 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 c -0.164063,0.023437 -0.398438,0.035156 -0.597657,0.035156 h -2.625 c -0.339843,0 -0.503906,-0.035156 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312505 -0.09375,1.99218755 l 0.01172,0.03515625 c 0,0 0.164063,-0.03515625 0.515625,-0.03515625 h 2.71875 c 0.210938,0 0.433594,0.01171875 0.597657,0.03515625 l 0.02344,-0.03515625 c -0.01172,-0.140625 -0.02344,-0.2109375 -0.02344,-0.3046875 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.476563,0.1171875 -2.8125,0.1171875 -0.07031,-0.234375 -0.07031,-1.26562505 -0.07031,-1.42968755 z M 21.129242,-6.59725 C 20.77768,-4.581625 20.238617,-1.8277188 19.84018,0.0355625 19.957367,4.0625e-4 20.051117,4.0625e-4 20.168305,4.0625e-4 c 0.117187,0 0.199218,0 0.304687,0.03515625 0.152344,-1.0546875 0.492188,-3.2695313 0.726563,-4.78125 h 0.04687 c 0.75,1.59375 1.453125,3.1992187 2.097656,4.74609375 h 0.1875 C 24.24643,-1.6285 24.949555,-3.1519375 25.74643,-4.7105313 l 0.02344,0.011719 c 0.210938,1.5585937 0.410156,3.0820312 0.5625,4.734375 0.152344,-0.03515625 0.339844,-0.03515625 0.480469,-0.03515625 0.152344,0 0.410156,0 0.5625,0.03515625 -0.363281,-2.1328125 -0.65625,-4.2070313 -0.9375,-6.6328125 h -0.246094 l -2.425781,4.9453125 h -0.07031 C 22.898773,-3.2925625 22.18393,-4.8863125 21.457367,-6.59725 Z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="MFEM"
+       transform="matrix(1.3333333,0,0,1.3333333,15.925333,254.42133)" />
+    <path
+       id="path36"
+       d="M 45.35506,-22.67793 89.19551,21.16212"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path37"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.9428,-0.94278667,-0.94278667,-0.9428,123.306,191.55451)" />
+    <path
+       id="path38"
+       d="m 2.2861484,-5.3202109 c 0,-0.890625 0.023437,-1.7578125 0.1171875,-2.390625 l -0.011719,-0.035156 c -0.140625,0.023437 -0.4804688,0.035156 -0.6210938,0.035156 -0.140625,0 -0.46875,-0.011719 -0.609375,-0.035156 l -0.023437,0.035156 c 0.09375,0.6796875 0.1171875,1.5 0.1171875,2.390625 v 2.9296875 c 0,0.890625 -0.023437,1.74609371 -0.1171875,2.3906249625 L 1.1494297,0.03525781 c 0,0 0.2109375,-0.0351562475 0.6210937,-0.0351562475 h 3.2578125 c 0.2578125,0 0.5273438,0.0117187475 0.7265625,0.0351562475 L 5.7783359,1.015625e-4 C 5.7666172,-0.16396094 5.7548984,-0.41005469 5.7548984,-0.52724219 c 0,-0.1171875 0.011719,-0.31640625 0.023437,-0.421875 l -0.023437,-0.046875 c 0,0 -1.7695312,0.26953125 -3.375,0.26953125 -0.082031,-0.29296876 -0.09375,-1.46484376 -0.09375,-1.66406246 z m 4.8310547,-1.8164063 c 0,0.328125 0.28125,0.609375 0.609375,0.609375 0.3164063,0 0.609375,-0.28125 0.609375,-0.609375 0,-0.3164062 -0.2929687,-0.609375 -0.609375,-0.609375 -0.328125,0 -0.609375,0.2929688 -0.609375,0.609375 z m 0.1171875,4.3242188 v 0.65625 c 0,0.9023437 -0.023437,1.51171871 -0.1054687,2.1562499625 L 7.1523594,0.03525781 c 0.140625,-0.0234375 0.4101562,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.4101562,0.0117187475 0.5625,0.0351562475 L 8.3007969,1.015625e-4 C 8.1953281,-0.66786719 8.1836094,-1.2420859 8.1836094,-2.1561484 v -0.84375 c 0,-0.9023438 0.023437,-1.3242188 0.1171875,-2.0507813 0,-0.1171875 -0.023437,-0.1289062 -0.1171875,-0.1289062 -0.2695313,0.035156 -0.7851563,0.035156 -1.03125,0.011719 l -0.023437,0.035156 c 0.070312,0.515625 0.1054687,1.4296875 0.1054682,2.3203125 z m 3.9638674,1.83984371 V -4.0663047 c 0.46875,-0.4804687 0.773437,-0.703125 1.253906,-0.703125 0.738281,0 1.382813,0.609375 1.382813,2.0976563 0,1.4765625 -0.445313,2.30859371 -1.582032,2.30859371 -0.375,0 -0.808593,-0.3046875 -1.054687,-0.609375 z m 0,-5.00390621 c 0,-0.8789063 0.01172,-1.5117188 0.07031,-2.25 0,-0.082031 -0.03516,-0.1171875 -0.105468,-0.1171875 -0.292969,0.1171875 -0.5625,0.1992187 -1.042969,0.234375 l -0.02344,0.035156 c 0.09375,0.515625 0.152344,1.40625 0.152344,2.3085938 v 4.89843746 c 0,0.45703125 -0.01172,0.64453125 -0.05859,0.890625 0.07031,0.0703125 0.199219,0.09375 0.328125,0.09375 0.128907,-0.140625 0.269532,-0.33984375 0.398438,-0.5859375 0.339844,0.3046875 0.890625,0.5859375 1.40625,0.5859375 1.21875,0 2.566406,-0.85546875 2.566406,-2.89453126 0,-1.453125 -1.03125,-2.4726562 -2.191406,-2.4726562 -0.574219,0 -1.089844,0.1523437 -1.5,0.5976562 z m 6.084961,1.9804687 v -1.0898437 c 0,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269531,0.058594 -0.703125,0.070312 -0.960937,0.035156 l -0.02344,0.035156 c 0.08203,0.5273437 0.105469,1.4296875 0.105469,2.3203125 v 0.65625 c 0,0.9023437 -0.02344,1.52343746 -0.105469,2.1562499625 l 0.02344,0.0351562475 c 0.140625,-0.0234375 0.410156,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.410156,0.0117187475 0.550781,0.0351562475 L 17.400406,1.015625e-4 C 17.294937,-0.67958594 17.283219,-1.2420859 17.283219,-2.1561484 v -0.6210938 c 0,-0.4921875 0.152343,-0.7851562 0.410156,-1.1835937 0.164062,-0.2578125 0.445312,-0.4101563 0.667969,-0.4101563 0.246093,0 0.46875,0.023437 0.621093,0.1640625 l 0.09375,-0.023437 0.246094,-0.890625 -0.04687,-0.046875 c -0.210937,-0.058594 -0.222656,-0.082031 -0.433594,-0.082031 -0.644531,0 -0.996093,0.375 -1.523437,1.2773437 z m 5.956054,1.3007813 -0.04687,1.4296875 c 0,0.1523437 -0.07031,0.234375 -0.164062,0.30468746 -0.339844,0.2578125 -0.738281,0.46875 -1.101563,0.46875 -0.539062,0 -0.878906,-0.3515625 -0.878906,-0.72656246 0,-0.5390625 0.246094,-0.9492188 1.183594,-1.1953125 z m 0,2.12109371 c 0.128907,0.46875 0.492188,0.69140625 0.9375,0.69140625 0.292969,0 0.667969,-0.0703125 0.9375,-0.36328125 l -0.08203,-0.29296875 c -0.128906,0.0351563 -0.234375,0.0585938 -0.316406,0.0585938 -0.128906,0 -0.28125,-0.0234375 -0.363281,-0.09375 -0.117188,-0.10546875 -0.1875,-0.3984375 -0.1875,-0.93750006 0,-0.3398437 0.03516,-1.6640625 0.03516,-1.8046875 0,-1.6054687 -1.066406,-1.9335937 -1.96875,-1.9335937 -0.914063,0 -1.464844,0.4453125 -1.757813,0.6914063 l -0.03516,0.046875 0.199219,0.8085937 0.152344,0.011719 c 0.339843,-0.5390624 0.738281,-1.0195312 1.3125,-1.0195312 0.433593,0 1.148437,0.058594 1.148437,1.40625 0,0.09375 -0.04687,0.140625 -0.08203,0.1523437 l -1.113281,0.2460938 c -1.21875,0.28125 -1.980469,0.9257812 -1.980469,1.734375 0,0.89062501 0.609375,1.28906251 1.488281,1.28906251 0.667969,0 0.996094,-0.15234375 1.628906,-0.69140625 z m 4.03711,-3.42187501 v -1.0898437 c 0,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269531,0.058594 -0.703125,0.070312 -0.960937,0.035156 l -0.02344,0.035156 c 0.08203,0.5273437 0.105469,1.4296875 0.105469,2.3203125 v 0.65625 c 0,0.9023437 -0.02344,1.52343746 -0.105469,2.1562499625 l 0.02344,0.0351562475 c 0.140625,-0.0234375 0.410156,-0.0351562475 0.5625,-0.0351562475 0.140625,0 0.410156,0.0117187475 0.550781,0.0351562475 L 27.39357,1.015625e-4 C 27.288102,-0.67958594 27.276383,-1.2420859 27.276383,-2.1561484 v -0.6210938 c 0,-0.4921875 0.152344,-0.7851562 0.410156,-1.1835937 0.164063,-0.2578125 0.445313,-0.4101563 0.667969,-0.4101563 0.246094,0 0.46875,0.023437 0.621094,0.1640625 l 0.09375,-0.023437 0.246093,-0.890625 -0.04687,-0.046875 c -0.210937,-0.058594 -0.222656,-0.082031 -0.433593,-0.082031 -0.644532,0 -0.996094,0.375 -1.523438,1.2773437 z m 2.771484,-1.171875 c 0.457031,1.0664063 1.886719,4.1484375 2.308594,5.28515626 -0.375,0.90234374 -0.867188,1.81640624 -1.40625,2.71875004 0.117187,-0.035156 0.304687,-0.058594 0.421875,-0.058594 0.117187,0 0.46875,0.023437 0.585937,0.058594 0.457032,-1.5 3.070313,-7.1953125 3.46875,-8.0039063 -0.128906,0.035156 -0.398437,0.035156 -0.515625,0.035156 -0.117187,0 -0.351562,0 -0.46875,-0.035156 -0.433593,1.359375 -0.996093,2.8828125 -1.582031,4.125 h -0.02344 c -0.585938,-1.4179687 -1.148438,-2.7421875 -1.582032,-4.125 -0.164062,0.035156 -0.421875,0.035156 -0.585937,0.035156 -0.164063,0 -0.457031,0 -0.621094,-0.035156 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="Library"
+       transform="matrix(1.3333333,0,0,1.3333333,131.72133,15.605333)" />
+    <g
+       id="g42"
+       clip-path="url(#clipPath43)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.2096133,0,0,-0.6048,95.08096,212.21524)"
+         style="fill:url(#linearGradient42);stroke:none"
+         id="path42" />
+    </g>
+    <path
+       id="path44"
+       d="m 90.7097,17.00783 v 22.67752 h 45.35506 V 17.00783 Z"
+       style="fill:none;stroke:#3c5a8a;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path45"
+       d="m 0.9021875,-1.9913125 c 0,0.75 -0.0234375,1.46484375 -0.09375,1.9921875 L 0.831875,0.03603125 C 0.9490625,0.01259375 1.1717188,8.75e-4 1.300625,8.75e-4 c 0.1171875,0 0.3398438,0.01171875 0.4570313,0.03515625 L 1.7810938,8.75e-4 c -0.082031,-0.5625 -0.082031,-1.2304687 -0.082031,-1.9921875 v -2.9882812 c 0,-0.75 0.035156,-1.2304688 0.082031,-1.875 0,-0.070312 -0.023437,-0.09375 -0.082031,-0.09375 -0.2578125,0.09375 -0.46875,0.1640625 -0.8671875,0.1992187 l -0.0234375,0.023437 c 0.0703125,0.4335938 0.09375,1.171875 0.09375,1.921875 z m 2.4052734,-3.9492187 c 0,0.2578125 0.234375,0.5039062 0.5039063,0.5039062 0.2695312,0 0.515625,-0.2460937 0.515625,-0.5039062 0,-0.2695313 -0.2460938,-0.515625 -0.515625,-0.515625 -0.2695313,0 -0.5039063,0.2460937 -0.5039063,0.515625 z m 0.1054688,3.5976562 v 0.5507813 c 0,0.75 -0.023437,1.2539062 -0.09375,1.7929687 l 0.023437,0.03515625 C 3.4598047,0.01259375 3.6824609,8.75e-4 3.8113672,8.75e-4 c 0.1171875,0 0.3398437,0.01171875 0.4570312,0.03515625 L 4.2918359,8.75e-4 c -0.082031,-0.5625 -0.082031,-1.03125 -0.082031,-1.7929687 v -0.703125 c 0,-0.75 0.011719,-1.1132813 0.082031,-1.7226563 0,-0.082031 -0.023437,-0.09375 -0.082031,-0.09375 -0.234375,0.023437 -0.6679688,0.023437 -0.8671875,0.011719 l -0.023437,0.023437 c 0.058594,0.4335937 0.09375,1.1835937 0.093749,1.933594 z m 3.3046875,1.53515625 V -3.3858437 c 0.3867187,-0.3984375 0.6328125,-0.5859375 1.0429687,-0.5859375 0.609375,0 1.1484375,0.5039062 1.1484375,1.7460937 0,1.23046875 -0.375,1.921875 -1.3125,1.921875 -0.3164062,0 -0.6796875,-0.2578125 -0.8789062,-0.50390625 z m 0,-4.17187495 c 0,-0.7382813 0.011719,-1.2539063 0.058594,-1.875 0,-0.070312 -0.035156,-0.09375 -0.09375,-0.09375 -0.2460937,0.09375 -0.46875,0.1640625 -0.8671875,0.1992187 l -0.011719,0.023437 c 0.058594,0.4335938 0.1171875,1.171875 0.1171875,1.921875 v 4.078125 c 0,0.38671875 -0.011719,0.5390625 -0.058594,0.75 0.070312,0.05859375 0.1757813,0.08203125 0.28125,0.08203125 0.1171875,-0.12890625 0.2226563,-0.29296875 0.328125,-0.4921875 0.2929688,0.24609375 0.75,0.4921875 1.1835938,0.4921875 1.0078125,0 2.1328125,-0.7265625 2.1328125,-2.41406245 0,-1.21875 -0.8671875,-2.0625 -1.828125,-2.0625 -0.4804688,0 -0.9023438,0.1289062 -1.2421875,0.4921875 z M 13.90707,-6.5499062 c -1.6875,0 -3.269531,1.453125 -3.269531,3.4101562 0,1.7226563 0.984375,3.24609375 3.128906,3.24609375 0.914063,0 1.78125,-0.28125 2.449219,-1.08984375 -0.01172,-0.1171875 -0.02344,-0.3398437 -0.05859,-0.4335937 l -0.07031,-0.023437 c -0.703125,0.77343745 -1.300781,1.03124995 -2.226563,1.03124995 -1.265625,0 -2.191406,-1.35937495 -2.191406,-2.91796875 0,-2.015625 1.277344,-2.7539062 2.097656,-2.7539062 0.902344,0 1.628907,0.3515625 2.039063,1.171875 L 15.922695,-4.921 c 0.02344,-0.5039062 0.07031,-0.6796875 0.175782,-1.1015625 l -0.02344,-0.035156 c 0,0 -0.949219,-0.4921875 -2.167969,-0.4921875 z m 4.857422,4.5585937 v -1.1601562 c 0.46875,0 1.523438,0.023437 2.214844,0.09375 l 0.03516,-0.035156 c -0.02344,-0.082031 -0.03516,-0.2226562 -0.03516,-0.3164062 0,-0.082031 0.01172,-0.2226563 0.03516,-0.3164063 l -0.03516,-0.023437 c -0.585938,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757812 0,-1.1953125 0.07031,-1.4414062 1.335937,0 2.71875,0.1289062 2.71875,0.1289062 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 C 21.389492,-6.4327187 21.155117,-6.421 20.955898,-6.421 h -2.625 c -0.339843,0 -0.503906,-0.035156 -0.503906,-0.035156 L 17.803555,-6.421 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.453125 -0.09375,1.9921875 l 0.01172,0.03515625 c 0,0 0.164063,-0.03515625 0.515625,-0.03515625 h 2.71875 c 0.210938,0 0.433594,0.01171875 0.597657,0.03515625 L 21.670742,8.75e-4 c -0.01172,-0.140625 -0.02344,-0.2109375 -0.02344,-0.3046875 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.476563,0.1171875 -2.8125,0.1171875 -0.07031,-0.234375 -0.07031,-1.265625 -0.07031,-1.4296875 z m 5.361328,0 v -1.1601562 c 0.46875,0 1.523438,0.023437 2.214844,0.09375 l 0.03516,-0.035156 c -0.02344,-0.082031 -0.03516,-0.2226562 -0.03516,-0.3164062 0,-0.082031 0.01172,-0.2226563 0.03516,-0.3164063 l -0.03516,-0.023437 c -0.585937,0.046875 -1.042969,0.09375 -2.214844,0.09375 v -0.7734375 c 0,-0.1757812 0,-1.1953125 0.07031,-1.4414062 1.335937,0 2.71875,0.1289062 2.71875,0.1289062 l 0.01172,-0.046875 c -0.01172,-0.082031 -0.01172,-0.1640625 -0.01172,-0.2578125 0,-0.082031 0,-0.234375 0.01172,-0.375 l -0.01172,-0.035156 C 26.75082,-6.4327187 26.516445,-6.421 26.317227,-6.421 h -2.625 c -0.339844,0 -0.503907,-0.035156 -0.503907,-0.035156 L 23.164883,-6.421 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.453125 -0.09375,1.9921875 l 0.01172,0.03515625 c 0,0 0.164062,-0.03515625 0.515625,-0.03515625 h 2.71875 c 0.210937,0 0.433593,0.01171875 0.597656,0.03515625 L 27.03207,8.75e-4 c -0.01172,-0.140625 -0.02344,-0.2109375 -0.02344,-0.3046875 0,-0.10546875 0.01172,-0.2578125 0.02344,-0.328125 l -0.02344,-0.046875 c 0,0 -1.476563,0.1171875 -2.8125,0.1171875 C 24.12582,-0.796 24.12582,-1.82725 24.12582,-1.9913125 Z m 5.358399,1.171875 v -4.7929687 c 0,-0.3515625 0.339844,-0.3984375 0.832031,-0.3984375 2.15625,0 2.800781,1.6523437 2.800781,3.1875 0,2.0039062 -1.148437,2.4023437 -2.554687,2.4023437 -0.972656,0 -1.078125,-0.0820313 -1.078125,-0.3984375 z M 29.050625,-6.421 c -0.339844,0 -0.503906,-0.035156 -0.503906,-0.035156 L 28.523281,-6.421 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.453125 -0.09375,1.9921875 L 28.535,0.03603125 c 0,0 0.164063,-0.03515625 0.515625,-0.03515625 0.703125,0 0.796875,0.0234375 1.828125,0.0234375 1.394531,0 3.257813,-0.64453125 3.257813,-3.09375 0,-1.8515625 -1.535157,-3.3867187 -3.445313,-3.3867187 -0.632812,0 -1.007812,0.035156 -1.640625,0.035156 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="libCEED"
+       transform="matrix(1.3333333,0,0,1.3333333,132.45333,186.56133)" />
+    <path
+       id="path46"
+       d="m 136.06476,28.3468 44.48515,100.09139"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path47"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.54150667,-1.2183867,-1.2183867,-0.54150667,245.11188,48.519773)" />
+    <path
+       id="path48"
+       d="m 136.06476,28.3468 44.16698,66.25003"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path49"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.73956,-1.10936,-1.10936,-0.73956,244.68763,93.64156)" />
+    <path
+       id="path50"
+       d="M 136.06476,28.3468 179.7063,61.07774"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path51"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.0666267,-0.79998667,-0.79998667,-1.0666267,243.98707,138.33368)" />
+    <path
+       id="path52"
+       d="m 136.06476,28.3468 h 43.21309"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path53"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.3333333,0,0,-1.3333333,243.4158,181.97493)" />
+    <path
+       id="path54"
+       d="M 136.06476,28.3468 179.7063,-4.38412"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path55"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.0666267,0.79998667,0.79998667,-1.0666267,243.98707,225.61616)" />
+    <path
+       id="path56"
+       d="m 136.06476,28.3468 44.16698,-66.25001"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path57"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.73956,1.10936,1.10936,-0.73956,244.68763,270.30828)" />
+    <path
+       id="path58"
+       d="M 136.06476,28.3468 180.54991,-71.74457"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path59"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.54150667,1.2183867,1.2183867,-0.54150667,245.11188,315.43007)" />
+    <path
+       id="path60"
+       d="m 2.2725703,-6.6798594 c 0,-0.3984375 0.1171875,-0.5507812 1.0898438,-0.5507812 0.6679687,0 1.5117187,0.2578125 1.5117187,1.4765625 0,1.0195312 -0.6796875,1.3242187 -1.6757812,1.3242187 H 2.2725703 Z m 0,2.7539063 h 1.1132813 c 1.3945312,0 2.0507812,0.8789062 2.0507812,1.9570312 0,0.8203125 -0.28125,1.48828127 -2.0039062,1.48828127 -0.84375,0 -1.1601563,-0.17578125 -1.1601563,-0.57421877 z m -0.515625,-3.7851563 c -0.4101562,0 -0.609375,-0.035156 -0.609375,-0.035156 l -0.023437,0.035156 c 0.09375,0.6796875 0.1171875,1.5 0.1171875,2.390625 v 2.9296875 c 0,0.890625 -0.023437,1.74609377 -0.1171875,2.390625025 l 0.011719,0.035156245 c 0,0 0.1992187,-0.035156245 0.6210937,-0.035156245 0.7382813,0 0.8320313,0.035156245 1.8984375,0.035156245 2.3789063,0 2.9648438,-1.20703127 2.9648438,-2.23828127 0,-1.1484375 -0.7734375,-1.8046875 -1.8164063,-2.0976562 0.609375,-0.3046875 1.1132813,-0.9375 1.1132813,-1.59375 0,-0.796875 -0.421875,-1.8515625 -2.6484375,-1.8515625 -0.4101563,0 -0.9140625,0.035156 -1.5117195,0.035156 z m 9.3105467,5.015625 -0.04687,1.4296875 c 0,0.1523438 -0.07031,0.234375 -0.164062,0.30468752 -0.339844,0.2578125 -0.738282,0.46875 -1.1015628,0.46875 -0.5390625,0 -0.8789063,-0.3515625 -0.8789063,-0.72656252 0,-0.5390625 0.2460938,-0.9492187 1.1835941,-1.1953125 z m 0,2.12109377 c 0.128906,0.46875 0.492188,0.69140625 0.9375,0.69140625 0.292969,0 0.667969,-0.0703125 0.9375,-0.36328125 l -0.08203,-0.29296875 c -0.128906,0.0351563 -0.234375,0.0585938 -0.316406,0.0585938 -0.128907,0 -0.28125,-0.0234375 -0.363282,-0.09375 -0.117187,-0.10546875 -0.1875,-0.3984375 -0.1875,-0.93749992 0,-0.3398438 0.03516,-1.6640626 0.03516,-1.8046876 0,-1.6054687 -1.066407,-1.9335937 -1.96875,-1.9335937 -0.9140628,0 -1.4648441,0.4453125 -1.7578128,0.6914062 l -0.035156,0.046875 0.1992188,0.8085937 0.1523437,0.011719 c 0.3398438,-0.5390625 0.7382813,-1.0195313 1.3125,-1.0195313 0.4335933,0 1.1484373,0.058594 1.1484373,1.40625 0,0.09375 -0.04687,0.140625 -0.08203,0.1523438 l -1.1132813,0.2460937 c -1.21875,0.28125 -1.9804687,0.9257813 -1.9804687,1.734375 0,0.89062507 0.609375,1.28906257 1.4882812,1.28906257 0.6679688,0 0.9960938,-0.15234375 1.6289058,-0.69140625 z m 5.036133,-4.19531247 c 0.457031,0 0.867188,0.3515625 1.195313,0.8789062 l 0.152343,-0.011719 0.28125,-0.9726563 -0.02344,-0.035156 c -0.375,-0.1992188 -0.996094,-0.3398438 -1.59375,-0.3398438 -1.242188,0 -2.542969,1.0195313 -2.542969,2.6484375 0,1.65234377 0.9375,2.71875002 2.402344,2.71875002 0.714844,0 1.277344,-0.234375 1.757812,-0.84375 l -0.234375,-0.2578125 h -0.04687 c -0.445312,0.41015625 -0.832031,0.50390625 -1.277343,0.50390625 -0.867188,0 -1.546875,-0.78515627 -1.546875,-2.17968747 0,-1.3242188 0.691406,-2.109375 1.476562,-2.109375 z m 3.216797,2.3789062 c 0,0.890625 -0.02344,1.75781252 -0.105469,2.390625025 l 0.02344,0.035156245 c 0.140625,-0.0234375 0.410156,-0.035156245 0.5625,-0.035156245 0.140625,0 0.410156,0.011718745 0.5625,0.035156245 l 0.02344,-0.035156245 C 20.281359,-0.67985938 20.269641,-1.4767344 20.269641,-2.3907969 v -0.1171875 c 0.105468,0.011719 0.386718,0.046875 0.492187,0.1289063 0.785156,0.8085937 1.066406,1.2421875 1.875,2.41406247 0.1875,-0.01171875 0.585938,-0.035156245 0.796875,-0.035156245 0.1875,0 0.609375,0.023437495 0.714844,0.035156245 l 0.02344,-0.035156245 C 23.269641,-0.93767188 22.636828,-1.4532969 21.500109,-2.8478281 c 0.46875,-0.4921875 1.640625,-1.6171875 2.425782,-2.2851563 l -0.03516,-0.035156 c -0.222656,0.035156 -0.878906,0.035156 -1.207031,0.035156 -0.515625,0.7617188 -1.417969,1.7226563 -1.886719,2.0859375 -0.152343,0.1171875 -0.363281,0.1640625 -0.527343,0.1757813 v -3.1054688 c 0,-0.9023437 0.04687,-1.4765625 0.117187,-2.25 0,-0.082031 -0.03516,-0.1171875 -0.117187,-0.1171875 -0.292969,0.1171875 -0.5625,0.1992188 -1.03125,0.234375 l -0.02344,0.035156 c 0.08203,0.515625 0.105469,1.40625 0.105469,2.3085937 z m 6.436523,-0.8671875 c 0.199219,-1.2539062 0.972657,-1.5117187 1.359375,-1.5117187 0.46875,0 0.996094,0.4453125 0.996094,1.3359375 0,0.1054687 -0.04687,0.1757812 -0.164062,0.1757812 z m 3.1875,2.015625 c -0.421875,0.45703127 -0.984375,0.65625002 -1.640625,0.65625002 -0.421875,0 -0.984375,-0.15234375 -1.300781,-0.67968752 -0.210937,-0.3398437 -0.292969,-0.8085937 -0.292969,-1.5234375 h 3.257813 c 0.128906,0 0.210937,-0.070312 0.210937,-0.1992187 0,-1.0078125 -0.492187,-2.2617188 -2.0625,-2.2617188 -1.230468,0 -2.449218,0.9960938 -2.449218,2.7539063 0,0.6796875 0.128906,1.3476562 0.539062,1.82812497 0.398438,0.50390625 1.078125,0.78515625 1.898438,0.78515625 0.855468,0 1.617187,-0.4453125 2.074218,-1.06640625 z m 6.225586,-0.9140625 c 0,-0.375 0.02344,-0.8203125 0.02344,-1.2070312 0,-1.2421875 -0.398438,-1.8867188 -1.476563,-1.8867188 -0.457031,0 -1.300781,0.1875 -2.074218,1.078125 l -0.02344,-0.035156 v -0.8789063 c -0.01172,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269532,0.058594 -0.703125,0.070312 -0.960938,0.035156 l -0.03516,0.035156 c 0.09375,0.5273438 0.117187,1.4296875 0.117187,2.3203125 v 0.65625 c 0,0.9023438 -0.01172,1.51171877 -0.117187,2.156250025 L 30.588,0.03498437 c 0.140625,-0.0234375 0.410156,-0.035156245 0.550781,-0.035156245 0.152344,0 0.421875,0.011718745 0.5625,0.035156245 l 0.02344,-0.035156245 C 31.630969,-0.67985938 31.61925,-1.2423594 31.61925,-2.1564219 v -1.3945312 c 0.609375,-0.7148438 1.300781,-0.984375 1.734375,-0.984375 0.609375,0 0.867188,0.2460937 0.867188,1.2070312 v 1.171875 c 0,0.9023438 -0.02344,1.52343752 -0.117188,2.156250025 l 0.02344,0.035156245 c 0.152343,-0.0234375 0.421875,-0.035156245 0.5625,-0.035156245 0.152343,0 0.421875,0.011718745 0.5625,0.035156245 L 35.2755,-1.71875e-4 C 35.18175,-0.67985938 35.170031,-1.2423594 35.170031,-2.1564219 Z m 5.258789,0.8085938 c -0.445312,0.56249997 -0.984375,0.84374997 -1.464843,0.84374997 -0.644532,0 -1.195313,-0.64453127 -1.195313,-2.13281247 0,-1.7929688 0.9375,-2.1328125 1.523438,-2.1328125 0.5625,0 0.855468,0.234375 1.136718,0.6914062 z m 0,0.65624997 h 0.02344 l 0.05859,0.691406255 c 0,0.023437495 0.03516,0.035156245 0.117187,0.035156245 0.152344,-0.01171875 0.234375,-0.035156245 0.398438,-0.035156245 0.152343,0 0.386718,0.011718745 0.539062,0.035156245 l 0.02344,-0.035156245 C 41.495227,-0.51579688 41.389758,-1.3947031 41.389758,-2.2970469 v -3.6796875 c 0,-0.8789062 0.04687,-1.5117187 0.105469,-2.25 0,-0.082031 -0.03516,-0.1171875 -0.105469,-0.1171875 -0.304688,0.1171875 -0.5625,0.1992188 -1.042969,0.234375 l -0.02344,0.035156 c 0.08203,0.515625 0.105468,1.40625 0.105468,2.3085937 v 0.7851563 c -0.269531,-0.1640625 -0.761718,-0.2695313 -0.996093,-0.2695313 -1.570313,0 -2.707032,1.078125 -2.707032,2.7070313 0,1.4648437 0.867188,2.66015622 2.167969,2.66015622 0.574219,0 1.113281,-0.24609375 1.535156,-0.80859375 z m 2.622071,-0.51562497 -0.199219,1.00781247 c 0.679687,0.234375 1.347656,0.31640625 1.804687,0.31640625 1.640625,0 2.027344,-0.97265625 2.027344,-1.64062502 0,-0.9726562 -0.773437,-1.359375 -1.628906,-1.5585937 -0.445313,-0.1054688 -1.101563,-0.3398438 -1.101563,-0.9375 0,-0.4921875 0.386719,-0.75 0.902344,-0.75 0.621094,0 1.007813,0.4921875 1.277344,0.8085937 l 0.164062,-0.011719 0.246094,-0.8789063 -0.02344,-0.035156 C 46.168075,-5.062672 45.511825,-5.250172 44.90245,-5.250172 c -0.878906,0 -1.804687,0.46875 -1.804687,1.4648438 0,0.9492187 0.703125,1.2773437 1.429687,1.4648437 0.703125,0.1875 1.21875,0.3867188 1.21875,0.9726563 0,0.63281247 -0.46875,0.98437497 -1.101562,0.98437497 -0.574219,0 -1.078125,-0.38671875 -1.417969,-0.85546877 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="Backends"
+       transform="matrix(1.3333333,0,0,1.3333333,250.716,17.398667)" />
+    <g
+       id="g64"
+       clip-path="url(#clipPath65)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,76.15032)"
+         style="fill:url(#linearGradient64);stroke:none"
+         id="path64" />
+    </g>
+    <path
+       id="path66"
+       d="m 181.4194,119.0565 v 22.67752 h 56.69362 V 119.0565 Z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path67"
+       d="m 1.8978125,-5.6261641 c 0,-0.2929687 0.1757812,-0.4453125 0.890625,-0.4453125 0.6914062,0 1.4179687,0.1875 1.4179687,1.3828125 0,1.1367188 -0.5625,1.5117188 -1.5234375,1.5117188 -0.2578125,0 -0.65625,-0.035156 -0.7851562,-0.1054688 z M 1.030625,-4.4308516 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.99218754 L 0.9603125,0.03399219 C 1.0775,0.01055469 1.3470312,-0.00116406 1.4642187,-0.00116406 c 0.1171875,0 0.3867188,0.01171875 0.5039063,0.03515625 l 0.023437,-0.03515625 c -0.082031,-0.5625 -0.09375,-1.24218754 -0.09375,-1.99218754 v -0.8554687 c 0.2109375,0.070312 0.4804687,0.09375 0.8320312,0.09375 1.8164063,0 2.4375,-1.1367188 2.4375,-1.9921875 0,-0.7382813 -0.46875,-1.7460938 -2.3085937,-1.7460938 -0.2578125,0 -1.1015625,0.070312 -1.3945313,0.070312 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.0234375,0.035156 c 0.0820312,0.5625 0.09375,1.2421875 0.0937505,1.992188 z m 8.1708984,3.51562504 0.046875,0.0234375 0.046875,0.890625 c 0,0.0234375 0.011719,0.03515625 0.035156,0.03515625 0.1289062,-0.01171875 0.2578125,-0.03515625 0.3867187,-0.03515625 0.140625,0 0.3281249,0.01171875 0.4570319,0.03515625 L 10.1859,-0.00116406 C 10.115586,-0.43475781 10.021836,-1.1613203 10.021836,-1.9113203 v -0.5742188 c 0,-0.7382812 0.01172,-1.265625 0.09375,-1.7929687 l -0.02344,-0.023437 c -0.1171871,0.011719 -0.3515621,0.023437 -0.4687496,0.023437 -0.1171875,0 -0.3515625,-0.011719 -0.46875,-0.023437 l -0.023437,0.023437 c 0.070312,0.5859375 0.09375,1.0429687 0.09375,1.7929687 v 0.984375 c -0.3046875,0.51562504 -0.8789062,1.04296879 -1.359375,1.04296879 -0.3515625,0 -0.6679687,-0.10546875 -0.6679687,-1.04296879 v -0.984375 c 0,-0.7382812 0,-1.265625 0.082031,-1.7929687 l -0.011719,-0.023437 c -0.1289063,0.011719 -0.3515625,0.023437 -0.46875,0.023437 -0.1289063,0 -0.3515625,-0.011719 -0.46875,-0.023437 l -0.023437,0.023437 c 0.082031,0.5742187 0.09375,1.0429687 0.09375,1.7929687 v 1.1953125 c 0,0.69140629 0.3164062,1.39453129 1.3007812,1.39453129 0.6210938,0 1.171875,-0.515625 1.5,-1.01953125 z M 12.582383,-3.3292891 v -0.9023437 c 0,-0.070312 -0.02344,-0.1054688 -0.05859,-0.1054688 -0.234375,0.046875 -0.585937,0.058594 -0.808594,0.035156 l -0.02344,0.023437 c 0.07031,0.4453125 0.09375,1.1835937 0.09375,1.9335937 v 0.5507813 c 0,0.75 -0.02344,1.26562499 -0.09375,1.79296874 l 0.02344,0.03515625 c 0.117188,-0.0234375 0.351563,-0.03515625 0.46875,-0.03515625 0.117188,0 0.351563,0.01171875 0.46875,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.03125004 -0.09375,-1.79296874 v -0.515625 c 0,-0.421875 0.117187,-0.65625 0.339844,-0.984375 0.140625,-0.2226563 0.363281,-0.3515625 0.550781,-0.3515625 0.199219,0 0.398437,0.023437 0.527344,0.140625 l 0.07031,-0.023437 0.199219,-0.7382813 -0.03516,-0.035156 c -0.164063,-0.046875 -0.175782,-0.070312 -0.363282,-0.070312 -0.527343,0 -0.820312,0.3046875 -1.253906,1.0664062 z m 3.035156,0.609375 c 0.175781,-1.03125 0.820313,-1.2539062 1.136719,-1.2539062 0.386719,0 0.84375,0.3632812 0.84375,1.1132812 0,0.09375 -0.04687,0.140625 -0.140625,0.140625 z m 2.660156,1.6875 c -0.351562,0.37500004 -0.808593,0.53906254 -1.359375,0.53906254 -0.351562,0 -0.820312,-0.12890625 -1.089843,-0.56250004 -0.175782,-0.28125 -0.234375,-0.6796875 -0.234375,-1.265625 h 2.707031 c 0.105469,0 0.175781,-0.058594 0.175781,-0.1757812 0,-0.8320313 -0.410156,-1.875 -1.722656,-1.875 -1.019531,0 -2.039063,0.8203125 -2.039063,2.2851562 0,0.5742188 0.105469,1.12500004 0.445313,1.52343754 0.339844,0.421875 0.914062,0.66796875 1.582031,0.66796875 0.726563,0 1.359375,-0.375 1.734375,-0.890625 z m 6.887696,-5.5195312 c -1.6875,0 -3.269532,1.453125 -3.269532,3.4101562 0,1.7226563 0.984375,3.24609379 3.128907,3.24609379 0.914062,0 1.78125,-0.28125 2.449218,-1.08984375 -0.01172,-0.11718754 -0.02344,-0.33984374 -0.05859,-0.43359374 l -0.07031,-0.023437 c -0.703125,0.77343749 -1.300781,1.03124999 -2.226562,1.03124999 -1.265625,0 -2.191407,-1.35937499 -2.191407,-2.91796879 0,-2.015625 1.277344,-2.7539062 2.097657,-2.7539062 0.902343,0 1.628906,0.3515625 2.039062,1.171875 l 0.117188,-0.011719 c 0.02344,-0.5039062 0.07031,-0.6796875 0.175781,-1.1015625 l -0.02344,-0.035156 c 0,0 -0.949218,-0.4921875 -2.167968,-0.4921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="PureC"
+       transform="matrix(1.3333333,0,0,1.3333333,265.31333,50.177333)" />
+    <path
+       id="path68"
+       d="m 238.11302,130.39507 32.40416,-28.35381"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path69"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.0034,0.87797333,0.87797333,-1.0034,365.06824,83.715627)" />
+    <g
+       id="g73"
+       clip-path="url(#clipPath74)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,121.50511)"
+         style="fill:url(#linearGradient73);stroke:none"
+         id="path73" />
+    </g>
+    <path
+       id="path75"
+       d="m 181.4194,85.04042 v 22.67752 h 56.69362 V 85.04042 Z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path76"
+       d="m 4.0532812,-2.74175 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.2929687,0 -0.8320312,-0.011719 -1.0546875,-0.035156 l 1.0546875,-2.4375 h 0.023437 c 0.4101563,0.9375 0.7382813,1.7578125 1.007813,2.449219 z M 1.7798437,-2.273 c 0.2460938,-0.023437 0.9375,-0.023437 1.2890625,-0.023437 0.375,0 0.9257813,0 1.1601563,0.023437 C 4.72125,-0.9605 4.9439062,-0.1870625 5.0142187,0.03559375 5.1782812,4.375e-4 5.3423437,4.375e-4 5.5064062,4.375e-4 c 0.1757813,0 0.4335938,0 0.5976563,0.03515625 C 5.611875,-0.97221875 4.2876562,-4.3706563 3.3735937,-6.5503438 h -0.28125 C 2.143125,-4.3472188 1.15875,-2.1675313 0.16265625,0.03559375 0.27984375,4.375e-4 0.39703125,4.375e-4 0.5025,4.375e-4 c 0.1171875,0 0.328125,0 0.45703125,0.03515625 C 1.111875,-0.57378125 1.4282812,-1.3940938 1.7798437,-2.273 Z m 6.5947266,0.9609375 h -0.011719 c -1.3125,-3.0351563 -1.8164063,-4.7929688 -1.9101563,-5.1445313 -0.1640625,0.023437 -0.3984375,0.035156 -0.5390625,0.035156 -0.1640625,0 -0.4453125,-0.011719 -0.5976562,-0.035156 C 5.8081638,-5.4487813 7.1206638,-2.0855 8.046445,0.10590625 H 8.3159766 C 9.2769141,-2.1089375 10.24957,-4.2534688 11.257383,-6.4565938 c -0.117188,0.023437 -0.304688,0.035156 -0.398438,0.035156 -0.105468,0 -0.339843,-0.011719 -0.445312,-0.035156 -0.351563,1.2890625 -1.3242189,3.4921875 -2.0390627,5.1445313 z M 17.049375,0.03559375 C 16.2525,-1.101125 15.59625,-2.1089375 14.729062,-3.4331563 c 0.925782,-1.3359375 1.722657,-2.4140625 2.167969,-3.0234375 -0.117187,0.035156 -0.316406,0.035156 -0.421875,0.035156 -0.105469,0 -0.304687,0 -0.421875,-0.035156 -0.539062,1.0078125 -0.785156,1.3710938 -1.605469,2.5664063 -0.65625,-0.9960938 -1.324218,-1.96875 -1.605468,-2.5664063 -0.175782,0.035156 -0.398438,0.035156 -0.574219,0.035156 -0.164063,0 -0.386719,0 -0.550781,-0.035156 l 2.144531,3.2578125 -2.191406,3.23437505 C 11.775937,4.375e-4 11.975156,4.375e-4 12.080625,4.375e-4 c 0.117187,0 0.316406,0 0.421875,0.03515625 0.527344,-0.984375 1.101562,-1.92187505 1.664062,-2.76562505 0.632813,0.9609375 1.207032,1.7695313 1.734375,2.76562505 C 16.076719,4.375e-4 16.299375,4.375e-4 16.475156,4.375e-4 c 0.175781,0 0.398438,0 0.574219,0.03515625 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="AVX"
+       transform="matrix(1.3333333,0,0,1.3333333,272.58,95.530667)" />
+    <path
+       id="path77"
+       d="m 238.11302,96.37898 31.87468,0.66393"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path78"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.3329867,-0.02774667,-0.02774667,-1.3329867,364.36227,90.380133)" />
+    <g
+       id="g82"
+       clip-path="url(#clipPath83)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,166.86044)"
+         style="fill:url(#linearGradient82);stroke:none"
+         id="path82" />
+    </g>
+    <path
+       id="path84"
+       d="m 181.4194,51.02391 v 22.67752 h 56.69362 V 51.02391 Z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path85"
+       d="m 1.8974688,-4.4304766 c 0,-0.75 0.023437,-1.4648437 0.1054687,-1.9921875 l -0.011719,-0.035156 c -0.1171875,0.023437 -0.3984375,0.035156 -0.515625,0.035156 -0.1171875,0 -0.3867188,-0.011719 -0.51562505,-0.035156 L 0.94825,-6.4226641 c 0.0820313,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 0.95996875,0.03436719 c 0,0 0.17578125,-0.0351562525 0.51562505,-0.0351562525 h 2.71875 c 0.2109375,0 0.4335937,0.0117187525 0.5976562,0.0351562525 L 4.8154375,-7.890625e-4 C 4.8037188,-0.14141406 4.792,-0.34063281 4.792,-0.43438281 c 0,-0.10546875 0.011719,-0.26953125 0.023437,-0.3515625 L 4.792,-0.83282031 c 0,0 -1.4765625,0.22265625 -2.8125,0.22265625 C 1.909188,-0.84453906 1.897469,-1.8289141 1.897469,-1.9929766 Z m 4.265625,0 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 6.0927813,0.03436719 C 6.2099688,0.01092969 6.4795,-7.890625e-4 6.5966875,-7.890625e-4 c 0.1171875,0 0.3867188,0.0117187525 0.5039063,0.0351562525 L 7.1240313,-7.890625e-4 C 7.042,-0.56328906 7.0302813,-1.2429766 7.0302813,-1.9929766 v -2.4375 c 0,-0.75 0.011719,-1.4648437 0.09375,-1.9921875 l -0.023437,-0.035156 c -0.1171875,0.023437 -0.3867188,0.035156 -0.5039063,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.093749,1.9921875 z m 3.7939453,-1.1367187 c 0,-0.328125 0.09375,-0.4570313 0.9023439,-0.4570313 0.550781,0 1.265625,0.2109375 1.265625,1.2304688 0,0.84375 -0.574219,1.1015625 -1.394531,1.1015625 H 9.9570391 Z m 0,2.296875 H 10.88282 c 1.160157,0 1.710938,0.7265625 1.710938,1.6289062 0,0.67968754 -0.246094,1.24218754 -1.675781,1.24218754 -0.703125,0 -0.9609379,-0.15234375 -0.9609379,-0.48046875 z M 9.5234453,-6.4226641 c -0.3398437,0 -0.5039062,-0.035156 -0.5039062,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 9.0078203,0.03436719 c 0,0 0.1640625,-0.0351562525 0.515625,-0.0351562525 0.6210937,0 0.6914067,0.0351562525 1.5820317,0.0351562525 1.980468,0 2.472656,-1.0078125 2.472656,-1.86328129 0,-0.9609375 -0.644531,-1.5117187 -1.511719,-1.7578125 0.503906,-0.2460937 0.925781,-0.7734375 0.925781,-1.3242187 0,-0.6679688 -0.351562,-1.546875 -2.203125,-1.546875 -0.351562,0 -0.773437,0.035156 -1.2656247,0.035156 z M 19.809578,0.03436719 C 19.012703,-1.1023516 18.356453,-2.1101641 17.489266,-3.4343828 c 0.925781,-1.3359375 1.722656,-2.4140625 2.167968,-3.0234375 -0.117187,0.035156 -0.316406,0.035156 -0.421875,0.035156 -0.105468,0 -0.304687,0 -0.421875,-0.035156 -0.539062,1.0078125 -0.785156,1.3710937 -1.605468,2.5664062 -0.65625,-0.9960937 -1.324219,-1.96875 -1.605469,-2.5664062 -0.175781,0.035156 -0.398438,0.035156 -0.574219,0.035156 -0.164062,0 -0.386719,0 -0.550781,-0.035156 l 2.144531,3.2578125 -2.191406,3.23437499 c 0.105469,-0.0351562525 0.304687,-0.0351562525 0.410156,-0.0351562525 0.117188,0 0.316406,0 0.421875,0.0351562525 0.527344,-0.984375 1.101563,-1.92187499 1.664063,-2.76562499 0.632812,0.9609375 1.207031,1.76953124 1.734375,2.76562499 0.175781,-0.0351562525 0.398437,-0.0351562525 0.574218,-0.0351562525 0.175782,0 0.398438,0 0.574219,0.0351562525 z m 2.589844,-0.375 c -0.5625,0 -1.277344,-0.4921875 -1.582031,-1.07812499 l -0.105469,0.011719 c -0.03516,0.3632813 -0.152344,0.72656254 -0.222656,1.03125004 l 0.01172,0.0234375 c 0,0 0.644532,0.45703125 1.816407,0.45703125 1.21875,0 2.167968,-0.75 2.167968,-1.91015629 0,-1.1484375 -0.972656,-1.6875 -1.78125,-1.9921875 -0.503906,-0.1875 -1.21875,-0.4804687 -1.21875,-1.3007812 0,-0.3632813 0.199219,-0.7617188 0.457032,-0.9023438 0.175781,-0.082031 0.386718,-0.1171875 0.609375,-0.1171875 0.550781,0 1.089843,0.4335938 1.359375,1.0429688 l 0.09375,-0.011719 c 0.04687,-0.3515625 0.140625,-0.6914062 0.234375,-0.9960937 l -0.02344,-0.023437 c 0,0 -0.410156,-0.4453125 -1.582031,-0.4453125 -0.269531,0 -0.5625,0.046875 -0.855469,0.1640625 -0.585937,0.2460937 -1.089844,0.8320312 -1.089844,1.5585937 0,1.0429688 0.878907,1.5351563 1.710938,1.875 0.644531,0.2695313 1.160156,0.5976563 1.160156,1.453125 0,0.73828129 -0.574219,1.16015629 -1.160156,1.16015569 z m 4.549805,-6.25781249 c -0.351563,2.015625 -0.890625,4.7695312 -1.289063,6.63281249 0.117188,-0.0351562525 0.210938,-0.0351562525 0.328125,-0.0351562525 0.117188,0 0.199219,0 0.304688,0.0351562525 0.152343,-1.05468749 0.492187,-3.26953129 0.726562,-4.78124999 h 0.04687 c 0.75,1.59375 1.453125,3.1992187 2.097656,4.7460937375 h 0.1875 C 30.066414,-1.6296953 30.769539,-3.1531328 31.566414,-4.7117266 l 0.02344,0.011719 c 0.210937,1.5585937 0.410156,3.0820312 0.5625,4.73437499 0.152343,-0.0351562525 0.339843,-0.0351562525 0.480468,-0.0351562525 0.152344,0 0.410157,0 0.5625,0.0351562525 -0.363281,-2.13281249 -0.65625,-4.20703129 -0.9375,-6.63281249 h -0.246093 l -2.425782,4.9453125 h -0.07031 c -0.796875,-1.640625 -1.511719,-3.234375 -2.238281,-4.9453125 z m 8.824218,0 c -0.351562,2.015625 -0.890625,4.7695312 -1.289062,6.63281249 0.117187,-0.0351562525 0.210937,-0.0351562525 0.328125,-0.0351562525 0.117187,0 0.199219,0 0.304687,0.0351562525 0.152344,-1.05468749 0.492188,-3.26953129 0.726563,-4.78124999 h 0.04687 c 0.75,1.59375 1.453125,3.1992187 2.097656,4.7460937375 h 0.1875 C 38.890633,-1.6296953 39.593758,-3.1531328 40.390633,-4.7117266 l 0.02344,0.011719 c 0.210938,1.5585937 0.410157,3.0820312 0.5625,4.73437499 0.152344,-0.0351562525 0.339844,-0.0351562525 0.480469,-0.0351562525 0.152344,0 0.410156,0 0.5625,0.0351562525 -0.363281,-2.13281249 -0.65625,-4.20703129 -0.9375,-6.63281249 h -0.246094 l -2.425781,4.9453125 h -0.07031 c -0.796875,-1.640625 -1.511719,-3.234375 -2.238282,-4.9453125 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="LIBXSMM"
+       transform="matrix(1.3333333,0,0,1.3333333,255.61067,140.98933)" />
+    <path
+       id="path86"
+       d="m 238.11302,62.36288 32.42081,29.04401"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path87"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.99304,-0.88962667,-0.88962667,-0.99304,365.09041,97.894827)" />
+    <g
+       id="g91"
+       clip-path="url(#clipPath92)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,212.21524)"
+         style="fill:url(#linearGradient91);stroke:none"
+         id="path91" />
+    </g>
+    <path
+       id="path93"
+       d="m 181.4194,17.00783 v 22.67752 h 56.69362 V 17.00783 Z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path94"
+       d="m 3.7843047,-6.5520703 c -1.6875,0 -3.26953126,1.453125 -3.26953126,3.4101562 0,1.7226563 0.98437496,3.24609379 3.12890626,3.24609379 0.9140625,0 1.78125,-0.28125 2.4492187,-1.08984375 C 6.0811797,-1.1028516 6.0694609,-1.3255078 6.0343047,-1.4192578 l -0.070312,-0.023437 c -0.703125,0.77343749 -1.3007813,1.03124999 -2.2265625,1.03124999 -1.265625,0 -2.1914063,-1.35937499 -2.1914063,-2.91796879 0,-2.015625 1.2773438,-2.7539062 2.0976563,-2.7539062 0.9023437,0 1.6289062,0.3515625 2.0390625,1.171875 l 0.1171875,-0.011719 c 0.023437,-0.5039062 0.070312,-0.6796875 0.1757812,-1.1015625 l -0.023437,-0.035156 c 0,0 -0.9492187,-0.4921875 -2.1679687,-0.4921875 z m 8.3730473,2.1210937 v 1.6757813 c 0,1.1132812 -0.164063,2.29687499 -1.851563,2.29687499 -1.6640624,0 -1.6640624,-1.68749999 -1.6640624,-2.22656249 v -1.7460938 c 0,-0.75 0.011719,-1.4648437 0.09375,-1.9921875 l -0.023437,-0.035156 c -0.1171875,0.023437 -0.3867188,0.035156 -0.5039063,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.0742188 c 0,2.03906249 1.4296875,2.46093749 2.3203129,2.46093749 2.050781,0 2.601562,-1.26562499 2.601562,-3.04687499 v -1.4882813 c 0,-0.75 0.02344,-1.4648437 0.105469,-1.9921875 l -0.02344,-0.035156 c -0.117187,0.023437 -0.222656,0.035156 -0.351562,0.035156 -0.117188,0 -0.222656,-0.011719 -0.339844,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 3.380859,3.60937504 V -5.6145703 c 0,-0.3515625 0.339844,-0.3984375 0.832031,-0.3984375 2.15625,0 2.800781,1.6523437 2.800781,3.1875 0,2.00390624 -1.148437,2.40234374 -2.554687,2.40234374 -0.972656,0 -1.078125,-0.0820312 -1.078125,-0.3984375 z M 15.104617,-6.4231641 c -0.339844,0 -0.503906,-0.035156 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.99218754 l 0.01172,0.03515625 c 0,0 0.164063,-0.03515625 0.515625,-0.03515625 0.703125,0 0.796875,0.0234375 1.828125,0.0234375 1.394531,0 3.257813,-0.64453125 3.257813,-3.09375004 0,-1.8515625 -1.535157,-3.3867187 -3.445313,-3.3867187 -0.632812,0 -1.007812,0.035156 -1.640625,0.035156 z m 9.372071,3.6796875 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.292969,0 -0.832032,-0.011719 -1.054688,-0.035156 l 1.054688,-2.4375 h 0.02344 c 0.410156,0.9375 0.738281,1.7578125 1.007813,2.4492187 z m -2.273438,0.46875 c 0.246094,-0.023437 0.9375,-0.023437 1.289063,-0.023437 0.375,0 0.925781,0 1.160156,0.023437 0.492187,1.31250004 0.714844,2.08593754 0.785156,2.30859379 0.164063,-0.03515625 0.328125,-0.03515625 0.492188,-0.03515625 0.175781,0 0.433593,0 0.597656,0.03515625 C 26.035281,-0.97394531 24.711063,-4.3723828 23.797,-6.5520703 h -0.28125 c -0.949219,2.203125 -1.933594,4.3828125 -2.929687,6.58593749 0.117187,-0.03515625 0.234375,-0.03515625 0.339843,-0.03515625 0.117188,0 0.328125,0 0.457032,0.03515625 0.152343,-0.609375 0.46875,-1.42968749 0.820312,-2.30859379 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="CUDA"
+       transform="matrix(1.3333333,0,0,1.3333333,266.27067,186.24)" />
+    <path
+       id="path95"
+       d="m 238.11302,28.3468 31.87468,0.66391"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path96"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.3329867,-0.02774667,-0.02774667,-1.3329867,364.36227,181.08973)" />
+    <g
+       id="g100"
+       clip-path="url(#clipPath101)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,257.56997)"
+         style="fill:url(#linearGradient100);stroke:none"
+         id="path100" />
+    </g>
+    <path
+       id="path102"
+       d="M 181.4194,-17.00824 V 5.66927 h 56.69362 v -22.67751 z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path103"
+       d="m 5.3424141,-4.4302187 v 0.796875 H 1.8971016 v -0.796875 c 0,-0.75 0.011719,-1.4648438 0.09375,-1.9921875 l -0.011719,-0.035156 c -0.1171875,0.023437 -0.3984375,0.035156 -0.515625,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.50390624,-0.035156 l -0.0234375,0.035156 c 0.0820312,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.011719,1.45312495 -0.09375,1.99218745 L 0.94788281,0.034625 C 1.0650703,0.0111875 1.3463203,-5.3125e-4 1.4635078,-5.3125e-4 c 0.1171875,0 0.3867188,0.01171875 0.5039063,0.03515625 l 0.023437,-0.03515625 c -0.082031,-0.5625 -0.09375,-1.24218745 -0.09375,-1.99218745 V -3.19975 h 3.4453125 v 1.2070313 c 0,0.75 -0.023437,1.45312495 -0.1054688,1.99218745 L 5.2486641,0.034625 c 0.1171875,-0.0234375 0.3984375,-0.03515625 0.515625,-0.03515625 0.1171875,0 0.3867187,0.01171875 0.515625,0.03515625 l 0.011719,-0.03515625 c -0.082031,-0.5625 -0.09375,-1.24218745 -0.09375,-1.99218745 v -2.4375 c 0,-0.75 0.011719,-1.4648438 0.09375,-1.9921875 l -0.011719,-0.035156 c -0.1171875,0.023437 -0.3984375,0.035156 -0.515625,0.035156 -0.1171875,0 -0.3867188,-0.011719 -0.5039063,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.1054688,1.2421875 0.1054683,1.9921875 z m 2.9208984,0 v 2.4375 c 0,0.75 -0.011719,1.45312495 -0.09375,1.99218745 L 8.193,0.034625 C 8.3101875,0.0111875 8.5797187,-5.3125e-4 8.6969062,-5.3125e-4 c 0.1171875,0 0.3867188,0.01171875 0.5039063,0.03515625 L 9.22425,-5.3125e-4 C 9.1422187,-0.56303125 9.1305,-1.2427187 9.1305,-1.9927187 v -2.4375 c 0,-0.75 0.011719,-1.4648438 0.09375,-1.9921875 l -0.023437,-0.035156 c -0.1171875,0.023437 -0.3867188,0.035156 -0.5039063,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.5039062,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 3.7968745,-1.1953125 c 0,-0.2929688 0.175782,-0.4453125 0.890625,-0.4453125 0.691407,0 1.417969,0.1875 1.417969,1.3828125 0,1.1367187 -0.5625,1.5117187 -1.523437,1.5117187 -0.257813,0 -0.65625,-0.035156 -0.785157,-0.1054687 z M 11.193,-4.4302187 v 2.4375 c 0,0.75 -0.01172,1.45312495 -0.09375,1.99218745 L 11.12269,0.034625 c 0.117188,-0.0234375 0.386719,-0.03515625 0.503907,-0.03515625 0.117187,0 0.386718,0.01171875 0.503906,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218745 -0.09375,-1.99218745 v -0.8554688 c 0.210938,0.070312 0.480469,0.09375 0.832032,0.09375 1.816406,0 2.4375,-1.1367187 2.4375,-1.9921875 0,-0.7382812 -0.46875,-1.7460937 -2.308594,-1.7460937 -0.257813,0 -1.101563,0.070312 -1.394531,0.070312 -0.117188,0 -0.386719,-0.011719 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="HIP"
+       transform="matrix(1.3333333,0,0,1.3333333,273.576,231.64133)" />
+    <path
+       id="path104"
+       d="m 238.11302,-5.6697 31.87468,0.66392"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path105"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.3329867,-0.02774667,-0.02774667,-1.3329867,364.36227,226.44507)" />
+    <g
+       id="g109"
+       clip-path="url(#clipPath110)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,302.92473)"
+         style="fill:url(#linearGradient109);stroke:none"
+         id="path109" />
+    </g>
+    <path
+       id="path111"
+       d="m 181.4194,-51.02432 v 22.67752 h 56.69362 v -22.67752 z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path112"
+       d="m 2.4131094,-0.33892969 c -0.5625,0 -1.2773438,-0.4921875 -1.58203127,-1.07812501 l -0.10546875,0.011719 c -0.0351563,0.3632812 -0.15234375,0.72656246 -0.22265625,1.03124996 l 0.0117188,0.0234375 c 0,0 0.64453127,0.45703125 1.81640627,0.45703125 1.21875,0 2.1679688,-0.75 2.1679688,-1.91015621 0,-1.1484375 -0.9726563,-1.6875 -1.78125,-1.9921875 -0.5039063,-0.1875 -1.21875,-0.4804688 -1.21875,-1.3007813 0,-0.3632812 0.1992187,-0.7617187 0.4570312,-0.9023437 0.1757813,-0.082031 0.3867188,-0.1171875 0.609375,-0.1171875 0.5507813,0 1.0898438,0.4335937 1.359375,1.0429687 l 0.09375,-0.011719 c 0.046875,-0.3515625 0.140625,-0.6914063 0.234375,-0.9960938 l -0.023437,-0.023437 c 0,0 -0.4101561,-0.4453125 -1.5820311,-0.4453125 -0.2695314,0 -0.5625,0.046875 -0.8554689,0.1640625 -0.5859375,0.2460938 -1.08984367,0.8320313 -1.08984367,1.5585938 0,1.0429687 0.87890617,1.5351562 1.71093757,1.875 0.6445311,0.2695312 1.1601562,0.5976562 1.1601562,1.453125 0,0.73828121 -0.5742187,1.16015621 -1.1601568,1.16015581 z M 8.3574453,-2.8350234 c 0.4101563,-0.796875 1.1015625,-1.78125 2.2968747,-3.6210938 -0.128906,0.023437 -0.292968,0.035156 -0.410156,0.035156 -0.128906,0 -0.3046874,-0.011719 -0.4218749,-0.035156 -0.4921875,1.0429688 -1.1601563,2.1679688 -1.7460938,3.0703125 -0.6210937,-1.0898437 -1.2070312,-2.0273437 -1.734375,-3.0703125 -0.1757812,0.023437 -0.3984375,0.035156 -0.5742187,0.035156 -0.1640625,0 -0.3984375,-0.011719 -0.5625,-0.035156 0.3984375,0.5976563 1.921875,3.0703125 2.296875,3.7148438 0,0.984375 -0.023437,2.22656246 -0.070312,2.77734371 0.1171875,-0.0234375 0.375,-0.0351562475 0.5039062,-0.0351562475 0.1171875,0 0.375,0.0117187475 0.4921875,0.0351562475 -0.035156,-0.48046875 -0.058594,-1.88671871 -0.070313,-2.87109371 z m 5.8974607,-3.7148438 c -1.6875,0 -3.269531,1.453125 -3.269531,3.4101563 0,1.7226562 0.984375,3.24609371 3.128906,3.24609371 0.914063,0 1.78125,-0.28125 2.449219,-1.08984375 C 16.55178,-1.1006484 16.54006,-1.3233047 16.50491,-1.4170547 l -0.07031,-0.023437 c -0.703125,0.77343751 -1.300781,1.03125001 -2.226563,1.03125001 -1.265625,0 -2.191406,-1.35937501 -2.191406,-2.91796871 0,-2.015625 1.277344,-2.7539063 2.097656,-2.7539063 0.902344,0 1.628907,0.3515625 2.039063,1.171875 l 0.117187,-0.011719 c 0.02344,-0.5039063 0.07031,-0.6796875 0.175782,-1.1015625 l -0.02344,-0.035156 c 0,0 -0.949219,-0.4921875 -2.167969,-0.4921875 z m 4.860352,2.1210938 c 0,-0.75 0.02344,-1.4648438 0.105469,-1.9921875 l -0.01172,-0.035156 c -0.117188,0.023437 -0.398438,0.035156 -0.515625,0.035156 -0.117188,0 -0.386719,-0.011719 -0.515625,-0.035156 l -0.01172,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312496 -0.09375,1.9921874625 l 0.01172,0.0351562475 c 0,0 0.175781,-0.0351562475 0.515625,-0.0351562475 h 2.71875 c 0.210937,0 0.433594,0.0117187475 0.597656,0.0351562475 L 22.033227,9.140625e-4 C 22.021508,-0.13971094 22.009789,-0.33892969 22.009789,-0.43267969 c 0,-0.10546875 0.01172,-0.26953125 0.02344,-0.3515625 l -0.02344,-0.046875 c 0,0 -1.476562,0.22265625 -2.8125,0.22265625 -0.07031,-0.234375 -0.08203,-1.21874996 -0.08203,-1.38281246 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="SYCL"
+       transform="matrix(1.3333333,0,0,1.3333333,269.16533,276.948)" />
+    <path
+       id="path113"
+       d="m 238.11302,-39.68576 31.87468,0.66391"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path114"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(1.3329867,-0.02774667,-0.02774667,-1.3329867,364.36227,271.79983)" />
+    <g
+       id="g118"
+       clip-path="url(#clipPath119)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.512,0,0,-0.6048,208.46693,348.27952)"
+         style="fill:url(#linearGradient118);stroke:none"
+         id="path118" />
+    </g>
+    <path
+       id="path120"
+       d="m 181.4194,-85.0404 v 22.67752 h 56.69362 V -85.0404 Z"
+       style="fill:none;stroke:#333333;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path121"
+       d="M 1.9216328,-6.5969687 C 1.5700703,-4.5813437 1.0310078,-1.8274375 0.63257031,0.03584375 0.74975781,6.875e-4 0.84350781,6.875e-4 0.96069531,6.875e-4 c 0.11718749,0 0.19921879,0 0.30468749,0.03515625 C 1.4177266,-1.0188437 1.7575703,-3.2336875 1.9919453,-4.7454062 h 0.046875 c 0.75,1.59375 1.453125,3.1992187 2.0976563,4.7460937 h 0.1875 C 5.0388203,-1.6282187 5.7419453,-3.1516562 6.5388203,-4.71025 l 0.023437,0.011719 C 6.7731953,-3.1399375 6.9724141,-1.6165 7.1247578,0.03584375 7.2771016,6.875e-4 7.4646016,6.875e-4 7.6052266,6.875e-4 c 0.1523437,0 0.4101562,0 0.5625,0.03515625 -0.3632813,-2.13281245 -0.65625,-4.20703125 -0.9375,-6.63281245 H 6.9841328 L 4.5583516,-1.6516562 H 4.4880391 C 3.6911641,-3.2922812 2.9763203,-4.8860312 2.2497578,-6.5969687 Z M 12.881594,-2.7415 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.292969,0 -0.832032,-0.011719 -1.054688,-0.035156 l 1.054688,-2.4375 h 0.02344 c 0.410156,0.9375 0.738281,1.7578125 1.007813,2.4492187 z m -2.273438,0.46875 c 0.246094,-0.023437 0.9375,-0.023437 1.289063,-0.023437 0.375,0 0.925781,0 1.160156,0.023437 0.492187,1.3125 0.714844,2.0859375 0.785156,2.30859375 C 14.006594,6.875e-4 14.170656,6.875e-4 14.334719,6.875e-4 c 0.175781,0 0.433593,0 0.597656,0.03515625 -0.492188,-1.0078125 -1.816406,-4.40624995 -2.730469,-6.58593745 h -0.28125 C 10.971437,-4.3469687 9.9870625,-2.1672812 8.9909687,0.03584375 9.1081562,6.875e-4 9.2253437,6.875e-4 9.3308125,6.875e-4 c 0.1171875,0 0.328125,0 0.4570312,0.03515625 0.1523438,-0.609375 0.4687503,-1.42968745 0.8203123,-2.30859375 z m 9.583008,1.1132813 v 0.44531245 c -0.339844,0.31640625 -0.925781,0.375 -1.464844,0.375 -1.769531,0 -2.425781,-1.60546875 -2.425781,-2.91796875 0,-1.7460937 0.960938,-2.859375 2.308594,-2.859375 0.890625,0 1.664062,0.5039063 2.109375,1.2539063 l 0.117187,-0.011719 c 0.03516,-0.5039062 0.09375,-0.7734375 0.199219,-1.1484375 l -0.01172,-0.035156 c 0,0 -1.054687,-0.4921875 -2.273437,-0.4921875 -1.710938,0 -3.480469,1.2070312 -3.480469,3.3984375 0,1.7226562 1.230469,3.25781245 3.234375,3.25781245 1.148438,0 1.96875,-0.33984375 2.648438,-0.90234375 v -0.0351563 c -0.09375,-0.09375 -0.105469,-0.3632813 -0.105469,-0.46875 v -0.058594 c 0,-0.75 0.02344,-1.171875 0.105469,-1.6992187 l -0.01172,-0.035156 c 0,0 -0.175781,0.035156 -0.515625,0.035156 -0.339844,0 -0.515625,-0.035156 -0.515625,-0.035156 l -0.01172,0.035156 c 0.07031,0.5625 0.09375,1.1484375 0.09375,1.8984375 z m 3.629883,-5.4375 c -0.351563,2.015625 -0.890625,4.7695312 -1.289063,6.63281245 C 22.649172,6.875e-4 22.742922,6.875e-4 22.860109,6.875e-4 c 0.117188,0 0.199219,0 0.304688,0.03515625 0.152344,-1.05468745 0.492187,-3.26953125 0.726562,-4.78124995 h 0.04687 c 0.75,1.59375 1.453125,3.1992187 2.097657,4.7460937 h 0.1875 c 0.714843,-1.6289062 1.417968,-3.1523437 2.214843,-4.7109375 l 0.02344,0.011719 c 0.210937,1.5585937 0.410156,3.0820312 0.5625,4.73437495 C 29.176516,6.875e-4 29.364016,6.875e-4 29.504641,6.875e-4 c 0.152343,0 0.410156,0 0.5625,0.03515625 -0.363282,-2.13281245 -0.65625,-4.20703125 -0.9375,-6.63281245 h -0.246094 l -2.425781,4.9453125 h -0.07031 c -0.796875,-1.640625 -1.511719,-3.234375 -2.238281,-4.9453125 z M 34.778078,-2.7415 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.292969,0 -0.832031,-0.011719 -1.054687,-0.035156 l 1.054687,-2.4375 h 0.02344 c 0.410156,0.9375 0.738281,1.7578125 1.007812,2.4492187 z m -2.273437,0.46875 c 0.246093,-0.023437 0.9375,-0.023437 1.289062,-0.023437 0.375,0 0.925781,0 1.160156,0.023437 0.492188,1.3125 0.714844,2.0859375 0.785157,2.30859375 C 35.903078,6.875e-4 36.067141,6.875e-4 36.231203,6.875e-4 c 0.175781,0 0.433594,0 0.597656,0.03515625 -0.492187,-1.0078125 -1.816406,-4.40624995 -2.730468,-6.58593745 h -0.28125 c -0.949219,2.203125 -1.933594,4.3828125 -2.929688,6.58593745 C 31.004641,6.875e-4 31.121828,6.875e-4 31.227297,6.875e-4 c 0.117187,0 0.328125,0 0.457031,0.03515625 0.152344,-0.609375 0.46875,-1.42968745 0.820313,-2.30859375 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="MAGMA"
+       transform="matrix(1.3333333,0,0,1.3333333,259.40267,322.40533)" />
+    <path
+       id="path122"
+       d="m 238.11302,-73.70186 33.33023,98.60217"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path123"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.42693333,-1.2630667,-1.2630667,-0.42693333,366.30299,186.57025)" />
+    <path
+       id="path124"
+       d="m 238.11302,-73.70186 33.04201,64.7076"
+       style="fill:none;stroke:#000000;stroke-width:0.49814;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path125"
+       d="M 2.14195,0 -1.28517,1.71356 0,0 -1.28517,-1.71356"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       transform="matrix(0.60634667,-1.18744,-1.18744,-0.60634667,365.91871,231.76304)" />
+    <path
+       id="path126"
+       d="m 6.4102891,-5.3204844 v 0.9609375 H 2.2735703 v -0.9609375 c 0,-0.890625 0.023437,-1.7578125 0.1171875,-2.390625 l -0.011719,-0.035156 c -0.140625,0.023437 -0.4804688,0.035156 -0.6210938,0.035156 -0.140625,0 -0.46875,-0.011719 -0.609375,-0.035156 l -0.023437,0.035156 c 0.09375,0.6796875 0.1171875,1.5 0.1171875,2.390625 v 2.9296875 c 0,0.890625 -0.023437,1.74609377 -0.1171875,2.390625025 L 1.136852,0.03498437 c 0.140625,-0.0234375 0.4804687,-0.035156245 0.6210937,-0.035156245 0.140625,0 0.46875,0.011718745 0.609375,0.035156245 L 2.3907578,-1.71875e-4 C 2.2970078,-0.67985938 2.2735703,-1.5001719 2.2735703,-2.3907969 v -1.4414062 h 4.1367188 v 1.4414062 c 0,0.890625 -0.023437,1.74609377 -0.1171875,2.390625025 l 0.011719,0.035156245 c 0.140625,-0.0234375 0.46875,-0.035156245 0.6210938,-0.035156245 0.140625,0 0.4570312,0.011718745 0.609375,0.035156245 L 7.5587266,-1.71875e-4 C 7.4532578,-0.67985938 7.4298203,-1.5001719 7.4298203,-2.3907969 v -2.9296875 c 0,-0.890625 0.023437,-1.7578125 0.1289063,-2.390625 l -0.011719,-0.035156 c -0.1523437,0.023437 -0.4804687,0.035156 -0.6210937,0.035156 -0.1523438,0 -0.46875,-0.011719 -0.609375,-0.035156 l -0.023437,0.035156 c 0.09375,0.6796875 0.1171875,1.5 0.1171872,2.390625 z m 5.9736329,2.625 -0.04687,1.4296875 c 0,0.1523438 -0.07031,0.234375 -0.164063,0.30468752 -0.339843,0.2578125 -0.738281,0.46875 -1.101562,0.46875 -0.539063,0 -0.878906,-0.3515625 -0.878906,-0.72656252 0,-0.5390625 0.246093,-0.9492187 1.183593,-1.1953125 z m 0,2.12109377 c 0.128906,0.46875 0.492187,0.69140625 0.9375,0.69140625 0.292969,0 0.667969,-0.0703125 0.9375,-0.36328125 l -0.08203,-0.29296875 c -0.128907,0.0351563 -0.234375,0.0585938 -0.316407,0.0585938 -0.128906,0 -0.28125,-0.0234375 -0.363281,-0.09375 -0.117187,-0.10546875 -0.1875,-0.3984375 -0.1875,-0.93749992 0,-0.3398438 0.03516,-1.6640626 0.03516,-1.8046876 0,-1.6054687 -1.066406,-1.9335937 -1.96875,-1.9335937 -0.914062,0 -1.4648434,0.4453125 -1.7578121,0.6914062 l -0.035156,0.046875 0.1992188,0.8085937 0.1523437,0.011719 c 0.3398436,-0.5390625 0.7382806,-1.0195313 1.3124996,-1.0195313 0.433594,0 1.148438,0.058594 1.148438,1.40625 0,0.09375 -0.04687,0.140625 -0.08203,0.1523438 l -1.113281,0.2460937 c -1.2187496,0.28125 -1.9804683,0.9257813 -1.9804683,1.734375 0,0.89062507 0.609375,1.28906257 1.4882813,1.28906257 0.667968,0 0.996093,-0.15234375 1.628906,-0.69140625 z m 4.037109,-3.42187497 v -1.0898438 c 0,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269532,0.058594 -0.703125,0.070312 -0.960938,0.035156 l -0.02344,0.035156 c 0.08203,0.5273438 0.105468,1.4296875 0.105468,2.3203125 v 0.65625 c 0,0.9023438 -0.02344,1.52343752 -0.105468,2.156250025 l 0.02344,0.035156245 c 0.140625,-0.0234375 0.410156,-0.035156245 0.5625,-0.035156245 0.140625,0 0.410156,0.011718745 0.550781,0.035156245 l 0.03516,-0.035156245 C 16.43275,-0.67985938 16.421031,-1.2423594 16.421031,-2.1564219 v -0.6210937 c 0,-0.4921875 0.152344,-0.7851563 0.410156,-1.1835938 0.164063,-0.2578125 0.445313,-0.4101562 0.667969,-0.4101562 0.246094,0 0.46875,0.023437 0.621094,0.1640625 l 0.09375,-0.023437 0.246094,-0.890625 -0.04687,-0.046875 c -0.210938,-0.058594 -0.222657,-0.082031 -0.433594,-0.082031 -0.644531,0 -0.996094,0.375 -1.523438,1.2773438 z m 6.46875,2.6484375 c -0.445312,0.56249997 -0.984375,0.84374997 -1.464844,0.84374997 -0.644531,0 -1.195312,-0.64453127 -1.195312,-2.13281247 0,-1.7929688 0.9375,-2.1328125 1.523437,-2.1328125 0.5625,0 0.855469,0.234375 1.136719,0.6914062 z m 0,0.65624997 h 0.02344 l 0.05859,0.691406255 c 0,0.023437495 0.03516,0.035156245 0.117188,0.035156245 0.152344,-0.01171875 0.234375,-0.035156245 0.398437,-0.035156245 0.152344,0 0.386719,0.011718745 0.539063,0.035156245 l 0.02344,-0.035156245 C 23.956187,-0.51579688 23.850719,-1.3947031 23.850719,-2.2970469 v -3.6796875 c 0,-0.8789062 0.04687,-1.5117187 0.105468,-2.25 0,-0.082031 -0.03516,-0.1171875 -0.105468,-0.1171875 -0.304688,0.1171875 -0.5625,0.1992188 -1.042969,0.234375 l -0.02344,0.035156 c 0.08203,0.515625 0.105469,1.40625 0.105469,2.3085937 v 0.7851563 c -0.269531,-0.1640625 -0.761719,-0.2695313 -0.996094,-0.2695313 -1.570312,0 -2.707031,1.078125 -2.707031,2.7070313 0,1.4648437 0.867188,2.66015622 2.167969,2.66015622 0.574219,0 1.113281,-0.24609375 1.535156,-0.80859375 z m 4.473633,0.73828125 c 0.117188,-0.03515625 0.28125,-0.03515625 0.398438,-0.03515625 0.117187,0 0.316406,0 0.421875,0.03515625 0.328125,-0.9140625 0.679687,-1.89843752 1.054687,-2.83593752 0.363281,0.9375 0.738281,1.91015627 1.089844,2.82421877 0.117187,-0.035156245 0.1875,-0.035156245 0.316406,-0.035156245 0.117188,0 0.328125,0 0.433594,0.035156245 0.585937,-1.40624997 1.546875,-3.70312497 2.226562,-5.20312497 -0.105468,0.035156 -0.398437,0.035156 -0.515625,0.035156 -0.117187,0 -0.292968,0 -0.398437,-0.035156 -0.445313,1.4179687 -0.984375,2.9414062 -1.488281,4.0898437 h -0.09375 c -0.492188,-1.3125 -0.867188,-2.8242187 -1.136719,-4.0898437 -0.152344,0.035156 -0.433594,0.035156 -0.597656,0.035156 -0.1875,0 -0.503907,0 -0.691407,-0.035156 0.152344,0.4921875 0.339844,1.0195312 0.550782,1.5703125 -0.316407,0.890625 -0.644532,1.7929687 -0.996094,2.53125 h -0.07031 c -0.597656,-1.359375 -0.996093,-2.7070313 -1.417968,-4.0898438 -0.164063,0.035156 -0.433594,0.035156 -0.597657,0.035156 -0.199218,0 -0.503906,0 -0.703125,-0.035156 0.808594,1.7578125 1.546875,3.4570313 2.214844,5.20312502 z m 9.884766,-2.74218752 -0.04687,1.4296875 c 0,0.1523438 -0.07031,0.234375 -0.164063,0.30468752 -0.339844,0.2578125 -0.738281,0.46875 -1.101562,0.46875 -0.539063,0 -0.878907,-0.3515625 -0.878907,-0.72656252 0,-0.5390625 0.246094,-0.9492187 1.183594,-1.1953125 z m 0,2.12109377 c 0.128906,0.46875 0.492187,0.69140625 0.9375,0.69140625 0.292968,0 0.667968,-0.0703125 0.9375,-0.36328125 l -0.08203,-0.29296875 c -0.128906,0.0351563 -0.234375,0.0585938 -0.316406,0.0585938 -0.128906,0 -0.28125,-0.0234375 -0.363281,-0.09375 -0.117188,-0.10546875 -0.1875,-0.3984375 -0.1875,-0.93749992 0,-0.3398438 0.03516,-1.6640626 0.03516,-1.8046876 0,-1.6054687 -1.066406,-1.9335937 -1.96875,-1.9335937 -0.914062,0 -1.464844,0.4453125 -1.757812,0.6914062 l -0.03516,0.046875 0.199219,0.8085937 0.152344,0.011719 c 0.339844,-0.5390625 0.738281,-1.0195313 1.3125,-1.0195313 0.433594,0 1.148437,0.058594 1.148437,1.40625 0,0.09375 -0.04687,0.140625 -0.08203,0.1523438 l -1.113281,0.2460937 c -1.21875,0.28125 -1.980469,0.9257813 -1.980469,1.734375 0,0.89062507 0.609375,1.28906257 1.488281,1.28906257 0.667969,0 0.996094,-0.15234375 1.628907,-0.69140625 z m 4.040039,-3.42187497 v -1.0898438 c 0,-0.082031 -0.02344,-0.1171875 -0.07031,-0.1171875 -0.269531,0.058594 -0.703125,0.070312 -0.960937,0.035156 l -0.02344,0.035156 c 0.08203,0.5273438 0.105469,1.4296875 0.105469,2.3203125 v 0.65625 c 0,0.9023438 -0.02344,1.52343752 -0.105469,2.156250025 l 0.02344,0.035156245 c 0.140625,-0.0234375 0.410156,-0.035156245 0.5625,-0.035156245 0.140625,0 0.410156,0.011718745 0.550781,0.035156245 l 0.03516,-0.035156245 C 41.299937,-0.67985938 41.288219,-1.2423594 41.288219,-2.1564219 v -0.6210937 c 0,-0.4921875 0.152343,-0.7851563 0.410156,-1.1835938 0.164062,-0.2578125 0.445312,-0.4101562 0.667969,-0.4101562 0.246093,0 0.46875,0.023437 0.621093,0.1640625 l 0.09375,-0.023437 0.246094,-0.890625 -0.04687,-0.046875 c -0.210937,-0.058594 -0.222656,-0.082031 -0.433594,-0.082031 -0.644531,0 -0.996093,0.375 -1.523437,1.2773438 z m 3.65625,0.7382812 c 0.199218,-1.2539062 0.972656,-1.5117187 1.359375,-1.5117187 0.46875,0 0.996093,0.4453125 0.996093,1.3359375 0,0.1054687 -0.04687,0.1757812 -0.164062,0.1757812 z m 3.1875,2.015625 c -0.421875,0.45703127 -0.984375,0.65625002 -1.640625,0.65625002 -0.421875,0 -0.984375,-0.15234375 -1.300782,-0.67968752 -0.210937,-0.3398437 -0.292968,-0.8085937 -0.292968,-1.5234375 h 3.257812 c 0.128906,0 0.210938,-0.070312 0.210938,-0.1992187 0,-1.0078125 -0.492188,-2.2617188 -2.0625,-2.2617188 -1.230469,0 -2.449219,0.9960938 -2.449219,2.7539063 0,0.6796875 0.128906,1.3476562 0.539062,1.82812497 0.398438,0.50390625 1.078125,0.78515625 1.898438,0.78515625 0.855469,0 1.617187,-0.4453125 2.074219,-1.06640625 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="Hardware"
+       transform="matrix(1.3333333,0,0,1.3333333,376.21467,17.398667)" />
+    <g
+       id="g130"
+       clip-path="url(#clipPath131)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.6632,0,0,-0.6048,325.63328,121.50511)"
+         style="fill:url(#linearGradient130);stroke:none"
+         id="path130" />
+    </g>
+    <path
+       id="path132"
+       d="m 272.12953,85.04042 v 22.67752 h 62.36289 V 85.04042 Z"
+       style="fill:none;stroke:#2d782d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path133"
+       d="m 3.7843438,-6.5503438 c -1.6875,0 -3.2695313,1.453125 -3.2695313,3.4101563 0,1.7226562 0.984375,3.24609375 3.1289063,3.24609375 0.9140625,0 1.78125,-0.28125 2.4492187,-1.08984375 C 6.0812188,-1.101125 6.0695,-1.3237813 6.0343438,-1.4175313 l -0.070312,-0.023437 c -0.703125,0.77343755 -1.3007813,1.03125005 -2.2265625,1.03125005 -1.265625,0 -2.1914063,-1.35937505 -2.1914063,-2.91796875 0,-2.015625 1.2773438,-2.7539063 2.0976563,-2.7539063 0.9023437,0 1.6289062,0.3515625 2.0390625,1.171875 l 0.1171875,-0.011719 C 5.8234063,-5.4253438 5.8702813,-5.601125 5.97575,-6.023 L 5.952313,-6.058156 c 0,0 -0.9492187,-0.4921875 -2.1679692,-0.4921878 z m 4.8574218,0.9257813 c 0,-0.2929688 0.1757813,-0.4453125 0.890625,-0.4453125 0.6914064,0 1.4179684,0.1875 1.4179684,1.3828125 0,1.1367187 -0.5625,1.5117187 -1.5234371,1.5117187 -0.2578125,0 -0.65625,-0.035156 -0.7851563,-0.1054687 z M 7.7745781,-4.42925 v 2.4375 c 0,0.75 -0.011719,1.453125 -0.09375,1.9921875 l 0.023437,0.03515625 C 7.8214531,0.01215625 8.0909844,4.375e-4 8.2081719,4.375e-4 c 0.1171875,0 0.3867187,0.01171875 0.5039062,0.03515625 L 8.7355156,4.375e-4 c -0.082031,-0.5625 -0.09375,-1.2421875 -0.09375,-1.9921875 v -0.8554688 c 0.2109375,0.070312 0.4804688,0.09375 0.8320313,0.09375 1.8164061,0 2.4375001,-1.1367187 2.4375001,-1.9921875 0,-0.7382812 -0.46875,-1.7460937 -2.3085939,-1.7460937 -0.2578125,0 -1.1015625,0.070312 -1.3945312,0.070312 -0.1171875,0 -0.3867188,-0.011719 -0.5039063,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.09375,1.2421875 0.09375,1.992188 z m 10.4414059,0 v 1.6757812 c 0,1.1132813 -0.164062,2.29687505 -1.851562,2.29687505 -1.664063,0 -1.664063,-1.68750005 -1.664063,-2.22656255 V -4.42925 c 0,-0.75 0.01172,-1.4648438 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.386719,0.035156 -0.503906,0.035156 -0.117188,0 -0.386719,-0.011719 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.0742187 c 0,2.03906255 1.429687,2.46093755 2.320312,2.46093755 2.050782,0 2.601563,-1.26562505 2.601563,-3.04687505 V -4.42925 c 0,-0.75 0.02344,-1.4648438 0.105469,-1.9921875 l -0.02344,-0.035156 c -0.117187,0.023437 -0.222656,0.035156 -0.351562,0.035156 -0.117188,0 -0.222657,-0.011719 -0.339844,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="CPU"
+       transform="matrix(1.3333333,0,0,1.3333333,395.65733,95.530667)" />
+    <g
+       id="g137"
+       clip-path="url(#clipPath138)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.6632,0,0,-0.6048,325.63328,212.21524)"
+         style="fill:url(#linearGradient137);stroke:none"
+         id="path137" />
+    </g>
+    <path
+       id="path139"
+       d="m 272.12953,17.00783 v 22.67752 h 62.36289 V 17.00783 Z"
+       style="fill:none;stroke:#2d782d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path140"
+       d="m 1.0775,-6.4231641 c 0.082031,1.8867188 0.058594,4.8398438 -0.10546875,6.42187504 l 0.0234375,0.03515625 C 1.1126563,0.01042969 1.2298438,-0.00128906 1.3470313,-0.00128906 c 0.1171875,0 0.2226562,0.01171875 0.3515625,0.03515625 l 0.011719,-0.03515625 c -0.082031,-0.5625 -0.09375,-1.24218754 -0.09375,-1.99218754 v -2.3789062 c 0,-0.609375 0.011719,-0.6445313 0.3867188,-0.1640625 L 5.48375,-0.14191406 c 0.1054688,0.15234375 0.2578125,0.24609375 0.421875,0.24609375 0.140625,0 0.1757813,-0.12890625 0.1875,-0.31640625 C 6.14,-2.4856641 6.14,-4.2200391 6.2923438,-6.4231641 l -0.011719,-0.035156 c -0.1289062,0.023437 -0.234375,0.035156 -0.3515625,0.035156 -0.1171875,0 -0.234375,-0.011719 -0.3515625,-0.035156 l -0.023437,0.035156 c 0.082031,0.5625 0.1054688,1.2421875 0.1054688,1.9921875 v 2.6601563 c -0.023437,0.5390625 -0.1640625,0.3164062 -0.5859375,-0.2578125 l -3.421875,-4.4179688 c 0,0 -0.09375,0.023437 -0.140625,0.023437 -0.3398438,0 -0.4101563,-0.035156 -0.4101563,-0.035156 z m 9.536133,5.109375 h -0.01172 C 9.2894141,-4.3489453 8.7855078,-6.1067578 8.6917578,-6.4583203 c -0.1640625,0.023437 -0.3984375,0.035156 -0.5390625,0.035156 -0.1640625,0 -0.4453125,-0.011719 -0.5976562,-0.035156 0.4921875,1.0078125 1.8046875,4.3710937 2.7304689,6.56249999 h 0.269531 c 0.960938,-2.21484379 1.933594,-4.35937499 2.941406,-6.56249999 -0.117187,0.023437 -0.304687,0.035156 -0.398437,0.035156 -0.105469,0 -0.339844,-0.011719 -0.445313,-0.035156 -0.351562,1.2890625 -1.324218,3.4921875 -2.039062,5.1445312 z m 4.095703,-3.1171875 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.99218754 l 0.02344,0.03515625 c 0.117188,-0.0234375 0.386719,-0.03515625 0.503907,-0.03515625 0.117187,0 0.386718,0.01171875 0.503906,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218754 -0.09375,-1.99218754 v -2.4375 c 0,-0.75 0.01172,-1.4648437 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.386719,0.035156 -0.503906,0.035156 -0.117188,0 -0.386719,-0.011719 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 3.796875,3.60937504 V -5.6145703 c 0,-0.3515625 0.339844,-0.3984375 0.832031,-0.3984375 2.15625,0 2.800781,1.6523437 2.800781,3.1875 0,2.00390624 -1.148437,2.40234374 -2.554687,2.40234374 -0.972656,0 -1.078125,-0.0820312 -1.078125,-0.3984375 z M 18.072617,-6.4231641 c -0.339844,0 -0.503906,-0.035156 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.99218754 l 0.01172,0.03515625 c 0,0 0.164063,-0.03515625 0.515625,-0.03515625 0.703125,0 0.796875,0.0234375 1.828125,0.0234375 1.394531,0 3.257813,-0.64453125 3.257813,-3.09375004 0,-1.8515625 -1.535157,-3.3867187 -3.445313,-3.3867187 -0.632812,0 -1.007812,0.035156 -1.640625,0.035156 z m 6.638672,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.99218754 l 0.02344,0.03515625 c 0.117187,-0.0234375 0.386718,-0.03515625 0.503906,-0.03515625 0.117187,0 0.386719,0.01171875 0.503906,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218754 -0.09375,-1.99218754 v -2.4375 c 0,-0.75 0.01172,-1.4648437 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117187,0.023437 -0.386719,0.035156 -0.503906,0.035156 -0.117188,0 -0.386719,-0.011719 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 5.953125,1.6875 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.292969,0 -0.832031,-0.011719 -1.054687,-0.035156 l 1.054687,-2.4375 h 0.02344 c 0.410156,0.9375 0.738281,1.7578125 1.007812,2.4492187 z m -2.273437,0.46875 c 0.246093,-0.023437 0.9375,-0.023437 1.289062,-0.023437 0.375,0 0.925781,0 1.160156,0.023437 0.492188,1.31250004 0.714844,2.08593754 0.785157,2.30859379 0.164062,-0.03515625 0.328125,-0.03515625 0.492187,-0.03515625 0.175781,0 0.433594,0 0.597656,0.03515625 -0.492187,-1.0078125 -1.816406,-4.40624999 -2.730468,-6.58593749 h -0.28125 c -0.949219,2.203125 -1.933594,4.3828125 -2.929688,6.58593749 0.117188,-0.03515625 0.234375,-0.03515625 0.339844,-0.03515625 0.117187,0 0.328125,0 0.457031,0.03515625 0.152344,-0.609375 0.46875,-1.42968749 0.820313,-2.30859379 z m 12.413086,1.1132813 v 0.44531249 c -0.339844,0.31640625 -0.925782,0.375 -1.464844,0.375 -1.769531,0 -2.425781,-1.60546879 -2.425781,-2.91796879 0,-1.7460937 0.960937,-2.859375 2.308593,-2.859375 0.890625,0 1.664063,0.5039063 2.109375,1.2539063 l 0.117188,-0.011719 c 0.03516,-0.5039062 0.09375,-0.7734375 0.199219,-1.1484375 l -0.01172,-0.035156 c 0,0 -1.054688,-0.4921875 -2.273438,-0.4921875 -1.710937,0 -3.480468,1.2070312 -3.480468,3.3984375 0,1.7226562 1.230468,3.25781249 3.234375,3.25781249 1.148437,0 1.96875,-0.33984375 2.648437,-0.90234375 v -0.0351563 c -0.09375,-0.09375 -0.105469,-0.36328134 -0.105469,-0.46875004 v -0.058594 c 0,-0.75 0.02344,-1.171875 0.105469,-1.6992187 l -0.01172,-0.035156 c 0,0 -0.175781,0.035156 -0.515625,0.035156 -0.339843,0 -0.515625,-0.035156 -0.515625,-0.035156 l -0.01172,0.035156 c 0.07031,0.5625 0.09375,1.1484375 0.09375,1.8984375 z m 3.603515,-4.4648438 c 0,-0.2929687 0.175781,-0.4453125 0.890625,-0.4453125 0.691406,0 1.417969,0.1875 1.417969,1.3828125 0,1.1367188 -0.5625,1.5117188 -1.523438,1.5117188 -0.257812,0 -0.65625,-0.035156 -0.785156,-0.1054688 z m -0.867187,1.1953125 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.99218754 l 0.02344,0.03515625 c 0.117188,-0.0234375 0.386719,-0.03515625 0.503906,-0.03515625 0.117188,0 0.386719,0.01171875 0.503907,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218754 -0.09375,-1.99218754 v -0.8554687 c 0.210938,0.070312 0.480469,0.09375 0.832031,0.09375 1.816407,0 2.4375,-1.1367188 2.4375,-1.9921875 0,-0.7382813 -0.46875,-1.7460938 -2.308593,-1.7460938 -0.257813,0 -1.101563,0.070312 -1.394532,0.070312 -0.117187,0 -0.386718,-0.011719 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 10.441406,0 v 1.6757813 c 0,1.1132812 -0.164063,2.29687499 -1.851563,2.29687499 -1.664062,0 -1.664062,-1.68749999 -1.664062,-2.22656249 v -1.7460938 c 0,-0.75 0.01172,-1.4648437 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117187,0.023437 -0.386718,0.035156 -0.503906,0.035156 -0.117187,0 -0.386719,-0.011719 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.0742188 c 0,2.03906249 1.429688,2.46093749 2.320313,2.46093749 2.050781,0 2.601562,-1.26562499 2.601562,-3.04687499 v -1.4882813 c 0,-0.75 0.02344,-1.4648437 0.105469,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.222657,0.035156 -0.351563,0.035156 -0.117187,0 -0.222656,-0.011719 -0.339844,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="NVIDIAGPU"
+       transform="matrix(1.3333333,0,0,1.3333333,371.81333,186.24)" />
+    <g
+       id="g144"
+       clip-path="url(#clipPath145)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.6632,0,0,-0.6048,325.63328,257.56997)"
+         style="fill:url(#linearGradient144);stroke:none"
+         id="path144" />
+    </g>
+    <path
+       id="path146"
+       d="M 272.12953,-17.00824 V 5.66927 h 62.36289 v -22.67751 z"
+       style="fill:none;stroke:#2d782d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path147"
+       d="m 4.0545313,-2.7407734 c -0.328125,0.011719 -0.75,0.023437 -1.03125,0.023437 -0.2929688,0 -0.8320313,-0.011719 -1.0546875,-0.035156 l 1.0546875,-2.4375 h 0.023437 c 0.4101562,0.9375 0.7382812,1.7578125 1.007813,2.449219 z m -2.2734375,0.46875 c 0.2460937,-0.023437 0.9375,-0.023437 1.2890625,-0.023437 0.375,0 0.9257812,0 1.1601562,0.023437 0.4921875,1.31249996 0.7148438,2.08593746 0.7851563,2.30859371 0.1640625,-0.03515625 0.328125,-0.03515625 0.4921875,-0.03515625 0.1757812,0 0.4335937,0 0.5976562,0.03515625 C 5.613125,-0.97124219 4.2889063,-4.3696797 3.3748438,-6.5493672 h -0.28125 C 2.144375,-4.3462422 1.16,-2.1665547 0.16390625,0.03657031 0.28109375,0.00141406 0.39828125,0.00141406 0.50375,0.00141406 c 0.1171875,0 0.328125,0 0.45703125,0.03515625 C 1.113125,-0.57280469 1.4295313,-1.3931172 1.7810938,-2.2720234 Z m 6.4072265,-4.3242188 c -0.3515625,2.015625 -0.890625,4.7695313 -1.2890625,6.63281251 0.1171875,-0.03515625 0.2109375,-0.03515625 0.328125,-0.03515625 0.1171875,0 0.1992188,0 0.3046875,0.03515625 C 7.6844141,-1.0181172 8.0242578,-3.2329609 8.2586328,-4.7446797 h 0.046875 c 0.75,1.59375 1.453125,3.1992188 2.0976562,4.74609376 h 0.1875 c 0.714844,-1.62890626 1.417969,-3.15234376 2.214844,-4.71093746 l 0.02344,0.011719 c 0.210938,1.5585938 0.410157,3.0820313 0.5625,4.73437501 0.152344,-0.03515625 0.339844,-0.03515625 0.480469,-0.03515625 0.152344,0 0.410156,0 0.5625,0.03515625 -0.363281,-2.13281251 -0.65625,-4.20703121 -0.9375,-6.63281251 H 13.25082 l -2.425781,4.9453125 h -0.07031 C 9.9578516,-3.2915547 9.2430078,-4.8853047 8.5164453,-6.5962422 Z m 8.8037107,5.77734376 V -5.6118672 c 0,-0.3515625 0.339844,-0.3984375 0.832032,-0.3984375 2.15625,0 2.800781,1.6523438 2.800781,3.1875 0,2.00390626 -1.148438,2.40234376 -2.554688,2.40234376 -0.972656,0 -1.078125,-0.0820313 -1.078125,-0.3984375 z M 16.558438,-6.4204609 c -0.339844,0 -0.503907,-0.035156 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.4375 c 0,0.75 -0.01172,1.45312496 -0.09375,1.99218746 l 0.01172,0.03515625 c 0,0 0.164062,-0.03515625 0.515625,-0.03515625 0.703125,0 0.796875,0.0234375 1.828125,0.0234375 1.394531,0 3.257812,-0.64453125 3.257812,-3.09374996 0,-1.8515625 -1.535156,-3.3867188 -3.445312,-3.3867188 -0.632813,0 -1.007813,0.035156 -1.640625,0.035156 z m 13.535156,5.2617187 v 0.44531251 c -0.339844,0.31640625 -0.925781,0.375 -1.464844,0.375 -1.769531,0 -2.425781,-1.60546871 -2.425781,-2.91796871 0,-1.7460938 0.960937,-2.859375 2.308594,-2.859375 0.890625,0 1.664062,0.5039062 2.109375,1.2539062 l 0.117187,-0.011719 c 0.03516,-0.5039063 0.09375,-0.7734375 0.199219,-1.1484375 l -0.01172,-0.035156 c 0,0 -1.054687,-0.4921875 -2.273437,-0.4921875 -1.710938,0 -3.480469,1.2070313 -3.480469,3.3984375 0,1.7226563 1.230469,3.25781251 3.234375,3.25781251 1.148437,0 1.96875,-0.33984375 2.648437,-0.90234375 v -0.0351563 c -0.09375,-0.09375 -0.105468,-0.36328116 -0.105468,-0.46874996 v -0.058594 c 0,-0.75 0.02344,-1.171875 0.105468,-1.6992189 l -0.01172,-0.035156 c 0,0 -0.175782,0.035156 -0.515625,0.035156 -0.339844,0 -0.515625,-0.035156 -0.515625,-0.035156 l -0.01172,0.035156 c 0.07031,0.5625 0.09375,1.1484376 0.09375,1.8984376 z m 3.606445,-4.4648437 c 0,-0.2929688 0.175781,-0.4453125 0.890625,-0.4453125 0.691406,0 1.417969,0.1875 1.417969,1.3828125 0,1.1367187 -0.5625,1.5117187 -1.523438,1.5117187 -0.257812,0 -0.65625,-0.035156 -0.785156,-0.1054687 z m -0.867187,1.1953125 v 2.4375 c 0,0.75 -0.01172,1.45312496 -0.09375,1.99218746 l 0.02344,0.03515625 c 0.117188,-0.0234375 0.386719,-0.03515625 0.503906,-0.03515625 0.117188,0 0.386719,0.01171875 0.503907,0.03515625 l 0.02344,-0.03515625 c -0.08203,-0.5625 -0.09375,-1.24218746 -0.09375,-1.99218746 v -0.8554688 c 0.210938,0.070312 0.480469,0.09375 0.832031,0.09375 1.816407,0 2.4375,-1.1367187 2.4375,-1.9921875 0,-0.7382812 -0.46875,-1.7460937 -2.308593,-1.7460937 -0.257813,0 -1.101563,0.070312 -1.394532,0.070312 -0.117187,0 -0.386718,-0.011719 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 10.438476,0 v 1.6757812 c 0,1.1132813 -0.164062,2.29687501 -1.851562,2.29687501 -1.664063,0 -1.664063,-1.68750001 -1.664063,-2.22656251 v -1.7460937 c 0,-0.75 0.01172,-1.4648438 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.386719,0.035156 -0.503907,0.035156 -0.117187,0 -0.386718,-0.011719 -0.503906,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.0742187 c 0,2.03906251 1.429687,2.46093751 2.320312,2.46093751 2.050781,0 2.601563,-1.26562501 2.601563,-3.04687501 v -1.4882812 c 0,-0.75 0.02344,-1.4648438 0.105468,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.222656,0.035156 -0.351563,0.035156 -0.117187,0 -0.222656,-0.011719 -0.339843,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="AMDGPU"
+       transform="matrix(1.3333333,0,0,1.3333333,378.95333,231.69733)" />
+    <g
+       id="g151"
+       clip-path="url(#clipPath152)">
+      <path
+         d="M 0,0 H 100 V 100 H 0 Z"
+         transform="matrix(1.6632,0,0,-0.6048,325.63328,302.92473)"
+         style="fill:url(#linearGradient151);stroke:none"
+         id="path151" />
+    </g>
+    <path
+       id="path153"
+       d="m 272.12953,-51.02432 v 22.67752 h 62.36289 v -22.67752 z"
+       style="fill:none;stroke:#2d782d;stroke-width:0.3985;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:10;stroke-dasharray:none;stroke-opacity:1"
+       transform="matrix(1.3333333,0,0,-1.3333333,4.3786667,219.77067)" />
+    <path
+       id="path154"
+       d="m 1.0306094,-4.4295391 v 2.4375 c 0,0.75 -0.011719,1.45312504 -0.09375,1.9921875375 L 0.96029687,0.03530469 C 1.0774844,0.01186719 1.3470156,1.484375e-4 1.4642031,1.484375e-4 c 0.1171875,0 0.3867188,0.0117187525 0.5039063,0.0351562525 L 1.9915469,1.484375e-4 C 1.9095156,-0.56235156 1.8977969,-1.2420391 1.8977969,-1.9920391 v -2.4375 c 0,-0.75 0.011719,-1.4648437 0.09375,-1.9921875 l -0.023437,-0.035156 c -0.1171875,0.023437 -0.3867188,0.035156 -0.5039063,0.035156 -0.1171875,0 -0.3867187,-0.011719 -0.50390623,-0.035156 l -0.0234375,0.035156 c 0.0820312,0.5625 0.09375,1.2421875 0.0937495,1.9921875 z m 6.4804687,2.6367188 c 0,-0.3164063 0.023437,-0.6914063 0.023437,-1.0078125 0,-1.03125 -0.3398437,-1.5703125 -1.2421875,-1.5703125 -0.375,0 -1.078125,0.1523437 -1.7226562,0.890625 l -0.011719,-0.023437 v -0.7265625 c -0.011719,-0.070312 -0.023437,-0.1054688 -0.070312,-0.1054688 -0.2226562,0.046875 -0.5859375,0.058594 -0.796875,0.035156 l -0.023437,0.023437 c 0.070312,0.4453125 0.09375,1.1835937 0.09375,1.9335937 v 0.5507813 c 0,0.75 -0.011719,1.25390624 -0.09375,1.7929687375 L 3.6907656,0.03530469 c 0.1171875,-0.0234375 0.3398438,-0.0351562525 0.46875,-0.0351562525 0.1171875,0 0.3398438,0.0117187525 0.46875,0.0351562525 L 4.6399844,1.484375e-4 C 4.5579531,-0.56235156 4.5579531,-1.0311016 4.5579531,-1.7928203 v -1.1601563 c 0.5039063,-0.5976562 1.078125,-0.8203125 1.4414063,-0.8203125 0.5039062,0 0.7148437,0.1992188 0.7148437,1.0078125 v 0.9726563 c 0,0.75 -0.023437,1.26562499 -0.09375,1.7929687375 L 6.6438906,0.03530469 c 0.1171875,-0.0234375 0.3515625,-0.0351562525 0.46875,-0.0351562525 0.1171875,0 0.3515625,0.0117187525 0.46875,0.0351562525 L 7.6048281,1.484375e-4 C 7.5227969,-0.56235156 7.5110781,-1.0311016 7.5110781,-1.7928203 Z m 1.7373047,-2.484375 c -0.2109375,0 -0.421875,0 -0.5390625,-0.011719 -0.070312,0.1640625 -0.1289062,0.2578125 -0.2226562,0.3984375 l 0.046875,0.070312 c 0.1757812,0 0.4804687,0 0.7148437,-0.011719 v 1.359375 c 0,0.5976562 -0.035156,1.3125 -0.035156,1.61718749 0,0.64453125 0.421875,0.9609375 0.8789062,0.9609375 0.421875,0 0.738281,-0.1054687525 1.160156,-0.375 l -0.128906,-0.234375 c -0.304688,0.09375 -0.527344,0.10546875 -0.785156,0.0703125 -0.234375,-0.0351563 -0.328125,-0.26953125 -0.328125,-0.82031249 0,-0.3046875 0.03516,-0.7265625 0.03516,-1.3242188 v -1.2539062 h 0.375 c 0.234375,0 0.574219,0.011719 0.714844,0.011719 0.03516,-0.1523437 0.05859,-0.2578125 0.117187,-0.3984375 l -0.04687,-0.070312 c -0.175781,0 -0.46875,0.011719 -0.691406,0.011719 h -0.46875 c 0,-0.7382813 0,-0.8671875 0.04687,-1.453125 0,-0.070312 -0.03516,-0.09375 -0.09375,-0.09375 -0.2460937,0.09375 -0.375,0.2109375 -0.6914062,0.2460937 l -0.023437,0.035156 c -0.023437,0.3398437 -0.035156,0.6914062 -0.035156,1.265625 z m 3.6796872,1.5585937 c 0.175782,-1.03125 0.820313,-1.2539062 1.136719,-1.2539062 0.386719,0 0.84375,0.3632812 0.84375,1.1132812 0,0.09375 -0.04687,0.140625 -0.140625,0.140625 z m 2.660157,1.6875 c -0.351563,0.37500004 -0.808594,0.53906254 -1.359375,0.53906254 -0.351563,0 -0.820313,-0.12890625 -1.089844,-0.56250004 -0.175781,-0.28125 -0.234375,-0.6796875 -0.234375,-1.265625 h 2.707031 c 0.105469,0 0.175781,-0.058594 0.175781,-0.1757812 0,-0.8320313 -0.410156,-1.875 -1.722656,-1.875 -1.019531,0 -2.039062,0.8203125 -2.039062,2.2851562 0,0.5742188 0.105468,1.12500004 0.445312,1.52343754 0.339844,0.421875 0.914063,0.66796875 1.582031,0.66796875 0.726563,0 1.359375,-0.375 1.734375,-0.890625 z m 1.514648,-0.9609375 c 0,0.75 -0.02344,1.46484379 -0.09375,1.9921875375 l 0.02344,0.0351562525 c 0.117188,-0.0234375 0.339844,-0.0351562525 0.46875,-0.0351562525 0.117188,0 0.339844,0.0117187525 0.457032,0.0351562525 L 17.981781,1.484375e-4 C 17.89975,-0.56235156 17.89975,-1.2303203 17.89975,-1.9920391 v -2.9882812 c 0,-0.75 0.03516,-1.2304688 0.08203,-1.875 0,-0.070312 -0.02344,-0.09375 -0.08203,-0.09375 -0.257813,0.09375 -0.46875,0.1640625 -0.867188,0.1992187 l -0.02344,0.023437 c 0.07031,0.4335938 0.09375,1.171875 0.09375,1.921875 z m 9.536133,0.8320313 v 0.44531249 c -0.339844,0.31640625 -0.925781,0.375 -1.464844,0.375 -1.769531,0 -2.425781,-1.60546879 -2.425781,-2.91796879 0,-1.7460937 0.960937,-2.859375 2.308594,-2.859375 0.890625,0 1.664062,0.5039063 2.109375,1.2539063 l 0.117187,-0.011719 c 0.03516,-0.5039062 0.09375,-0.7734375 0.199219,-1.1484375 l -0.01172,-0.035156 c 0,0 -1.054687,-0.4921875 -2.273437,-0.4921875 -1.710938,0 -3.480469,1.2070312 -3.480469,3.3984375 0,1.7226562 1.230469,3.25781249 3.234375,3.25781249 1.148437,0 1.96875,-0.33984375 2.648437,-0.90234375 v -0.0351563 c -0.09375,-0.09375 -0.105468,-0.36328134 -0.105468,-0.46875004 v -0.058594 c 0,-0.75 0.02344,-1.171875 0.105468,-1.6992187 l -0.01172,-0.035156 c 0,0 -0.175782,0.035156 -0.515625,0.035156 -0.339844,0 -0.515625,-0.035156 -0.515625,-0.035156 l -0.01172,0.035156 c 0.07031,0.5625 0.09375,1.1484375 0.09375,1.8984375 z m 3.603515,-4.4648438 c 0,-0.2929687 0.175782,-0.4453125 0.890625,-0.4453125 0.691407,0 1.417969,0.1875 1.417969,1.3828125 0,1.1367188 -0.5625,1.5117188 -1.523437,1.5117188 -0.257813,0 -0.65625,-0.035156 -0.785157,-0.1054688 z m -0.867187,1.1953125 v 2.4375 c 0,0.75 -0.01172,1.45312504 -0.09375,1.9921875375 l 0.02344,0.0351562525 c 0.117188,-0.0234375 0.386719,-0.0351562525 0.503907,-0.0351562525 0.117187,0 0.386718,0.0117187525 0.503906,0.0351562525 L 30.336273,1.484375e-4 C 30.254242,-0.56235156 30.242523,-1.2420391 30.242523,-1.9920391 v -0.8554687 c 0.210938,0.070312 0.480469,0.09375 0.832032,0.09375 1.816406,0 2.4375,-1.1367188 2.4375,-1.9921875 0,-0.7382813 -0.46875,-1.7460938 -2.308594,-1.7460938 -0.257813,0 -1.101563,0.070312 -1.394531,0.070312 -0.117188,0 -0.386719,-0.011719 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 10.438476,0 v 1.6757813 c 0,1.1132812 -0.164062,2.29687499 -1.851562,2.29687499 -1.664063,0 -1.664063,-1.68749999 -1.664063,-2.22656249 v -1.7460938 c 0,-0.75 0.01172,-1.4648437 0.09375,-1.9921875 l -0.02344,-0.035156 c -0.117188,0.023437 -0.386719,0.035156 -0.503906,0.035156 -0.117188,0 -0.386719,-0.011719 -0.503907,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 v 2.0742188 c 0,2.03906249 1.429687,2.46093749 2.320312,2.46093749 2.050782,0 2.601563,-1.26562499 2.601563,-3.04687499 v -1.4882813 c 0,-0.75 0.02344,-1.4648437 0.105469,-1.9921875 l -0.02344,-0.035156 c -0.117187,0.023437 -0.222656,0.035156 -0.351562,0.035156 -0.117188,0 -0.222657,-0.011719 -0.339844,-0.035156 l -0.02344,0.035156 c 0.08203,0.5625 0.09375,1.2421875 0.09375,1.9921875 z m 0,0"
+       style="fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none"
+       aria-label="IntelGPU"
+       transform="matrix(1.3333333,0,0,1.3333333,381.25867,277.26933)" />
+  </g>
 </svg>
diff --git a/doc/img/libCEEDBackends.tex b/doc/img/libCEEDBackends.tex
new file mode 100644
index 0000000000..a1b9d28652
--- /dev/null
+++ b/doc/img/libCEEDBackends.tex
@@ -0,0 +1,192 @@
+\documentclass[tikz]{standalone}
+\usepackage{tikz}
+\usepackage{pgfplots}
+\usepackage{pgfmath}
+\usepackage{libertine}
+\usetikzlibrary{calc}
+
+\renewcommand{\familydefault}{\sfdefault}
+
+\definecolor{ceed@blue}{RGB}{100,150,230}
+\definecolor{ceed@green}{RGB}{75,200,75}
+\definecolor{ceed@red}{RGB}{200,75,75}
+\definecolor{ceed@orange}{RGB}{252,186,3}
+
+\pgfplotsset{compat=1.18}
+
+\begin{document}
+
+\begin{tikzpicture}
+
+\begin{scope}[shift={(0,-0.6)}]
+  \node at (1.0,6.1) {\large Application};
+
+  % PETSc
+  \draw[
+    top color=ceed@red!10!white,
+    bottom color=ceed@red!40!white,
+    ceed@red!60!black,
+  ] (0.0,3.0) rectangle ++(1.6,0.8)
+  node[pos=.5,align=center,color=black] {PETSc};
+  \draw[-stealth, line width=0.5pt] (1.6, 3.0+0.4) -- ++(1.6,-1.2-0.4);
+
+  % Ratel
+  \draw[
+    top color=ceed@red!10!white,
+    bottom color=ceed@red!40!white,
+    ceed@red!60!black,
+  ] (0.0,1.8) rectangle ++(1.6,0.8)
+  node[pos=.5,align=center,color=black] {Ratel};
+  \draw[-stealth, line width=0.5pt] (1.6, 1.8+0.4) -- ++(1.6,0.0-0.55);
+
+  % HONEE
+  \draw[
+    top color=ceed@red!10!white,
+    bottom color=ceed@red!40!white,
+    ceed@red!60!black,
+  ] (0.0,0.6) rectangle ++(1.6,0.8)
+  node[pos=.5,align=center,color=black] {HONEE};
+  \draw[-stealth, line width=0.5pt] (1.6, 0.6+0.4) -- ++(1.6,1.2-0.65);
+
+  % MFEM
+  \draw[
+    top color=ceed@red!10!white,
+    bottom color=ceed@red!40!white,
+    ceed@red!60!black,
+  ] (0.0,-0.6) rectangle ++(1.6,0.8)
+  node[pos=.5,align=center,color=black] {MFEM};
+  \draw[-stealth, line width=0.5pt] (1.6, -0.6+0.4) -- ++(1.6,2.4-0.8);
+\end{scope}
+
+\begin{scope}[shift={(3.2,0)}]
+  \begin{scope}[shift={(0,-0.6)}]
+  \node at (0.8,6.1) {\large Library};
+    \draw[
+      top color=ceed@blue!10!white,
+      bottom color=ceed@blue!40!white,
+      ceed@blue!60!black,
+    ] (0.0,1.2) rectangle ++(1.6,0.8)
+    node[pos=.5,align=center,color=black] {libCEED};
+
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,3.6);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,2.4);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,1.2);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,0.0);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,-1.2);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,-2.4);
+    \draw[-stealth, line width=0.5pt] (1.6, 1.6) -- ++(1.6,-3.6);
+  \end{scope}
+\end{scope}
+
+\begin{scope}[shift={(6.4,0)}]
+  \begin{scope}[shift={(0,-0.6)}]
+    \node at (0.95,6.1) {\large Backends};
+
+    % C
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,4.8) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {Pure C};
+    \draw[-stealth, line width=0.5pt] (2.0, 5.2) -- ++(1.2,-1.2+0.15);
+
+    % AVX
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,3.6) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {AVX};
+    \draw[-stealth, line width=0.5pt] (2.0, 4.0) -- ++(1.2,+0.0+0.025);
+
+    % LIBXSMM
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,2.4) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {LIBXSMM};
+    \draw[-stealth, line width=0.5pt] (2.0, 2.8) -- ++(1.2,1.1-0.025);
+
+    % CUDA
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,1.2) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {CUDA};
+    \draw[-stealth, line width=0.5pt] (2.0, 1.6) -- ++(1.2,0.0+0.025);
+
+    % HIP
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,0.0) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {HIP};
+    \draw[-stealth, line width=0.5pt] (2.0, 0.4) -- ++(1.2,0.0+0.025);
+
+    % SYCL
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,-1.2) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {SYCL};
+    \draw[-stealth, line width=0.5pt] (2.0, -0.8) -- ++(1.2,0.0+0.025);
+
+    % MAGMA
+    \draw[
+      top color=black!5!white,
+      bottom color=black!20!white,
+      black!80!white,
+    ] (0.0,-2.4) rectangle ++(2.0,0.8)
+    node[pos=.5,align=center,color=black] {MAGMA};
+    \draw[-stealth, line width=0.5pt] (2.0, -2.0) -- ++(1.2,3.7-0.15);
+    \draw[-stealth, line width=0.5pt] (2.0, -2.0) -- ++(1.2,2.5-0.15);
+
+  \end{scope}
+\end{scope}
+
+\begin{scope}[shift={(9.6,0)}]
+  \begin{scope}[shift={(0,-0.6)}]
+    \node at (1.1,6.1) {\large Hardware};
+
+    % CPU
+    \draw[
+      top color=ceed@green!20!white,
+      bottom color=ceed@green!60!white,
+      ceed@green!60!black,
+    ] (0.0,3.6) rectangle ++(2.2,0.8)
+    node[pos=.5,align=center,color=black] {CPU};
+
+    % CUDA GPU
+    \draw[
+      top color=ceed@green!20!white,
+      bottom color=ceed@green!60!white,
+      ceed@green!60!black,
+    ] (0.0,1.2) rectangle ++(2.2,0.8)
+    node[pos=.5,align=center,color=black] {NVIDIA GPU};
+
+    % ROCm GPU
+    \draw[
+      top color=ceed@green!20!white,
+      bottom color=ceed@green!60!white,
+      ceed@green!60!black,
+    ] (0.0,-0.0) rectangle ++(2.2,0.8)
+    node[pos=.5,align=center,color=black] {AMD GPU};
+
+    % Intel GPU
+    \draw[
+      top color=ceed@green!20!white,
+      bottom color=ceed@green!60!white,
+      ceed@green!60!black,
+    ] (0.0,-1.2) rectangle ++(2.2,0.8)
+    node[pos=.5,align=center,color=black] {Intel GPU};
+
+  \end{scope}
+\end{scope}
+
+\end{tikzpicture}
+\end{document}
diff --git a/doc/sphinx/requirements.txt b/doc/sphinx/requirements.txt
index 76b40ca3ab..f4f8145a5c 100644
--- a/doc/sphinx/requirements.txt
+++ b/doc/sphinx/requirements.txt
@@ -1,11 +1,11 @@
 altair>=5.0
-breathe>=4.30
-myst-parser[linkify]>=0.14.0
-sphinx-hoverxref>=0.3b1
+breathe>=4.36
+myst-parser[linkify]>=4.0.1
+sphinx-hoverxref>=1.4.2
 sphinx-design
-sphinx>=5.3,<6
+sphinx>=7.2
 sphinx_rtd_theme
-sphinxcontrib-bibtex==2.5
+sphinxcontrib-bibtex==2.6.3
 sphinxcontrib-katex
 sphinxcontrib-mermaid
 sphinxcontrib-svg2pdfconverter
diff --git a/doc/sphinx/source/conf.py b/doc/sphinx/source/conf.py
index 2dbcfa648a..3f956d3785 100755
--- a/doc/sphinx/source/conf.py
+++ b/doc/sphinx/source/conf.py
@@ -40,7 +40,6 @@
 extensions = [
     "sphinxext_altair.altairplot",
     "breathe",
-    "hoverxref.extension",
     "sphinx_design",
     "myst_parser",
     "sphinx_rtd_theme",
@@ -107,6 +106,7 @@
     "examples/nek/README.md",
     "examples/petsc/README.md",
     "examples/solid/README.md",
+    "examples/deal.II/README.md",
 ]
 
 # The name of the Pygments (syntax highlighting) style to use.
@@ -161,13 +161,6 @@
     ]
 }
 
-# hoverxref options
-hoverxref_auto_ref = True
-hoverxref_mathjax = True
-hoverxref_role_types = {
-    "ref": "modal",
-}
-
 latex_macros = r"""
 \def \diff {\operatorname{d}\!}
 \def \tcolon {\!:\!}
diff --git a/doc/sphinx/source/gettingstarted.md b/doc/sphinx/source/gettingstarted.md
index 0f1a831b59..0369bbaa40 100644
--- a/doc/sphinx/source/gettingstarted.md
+++ b/doc/sphinx/source/gettingstarted.md
@@ -1,5 +1,6 @@
 # Getting Started
 
 ```{include} ./README.md
-:start-after: gettingstarted-inclusion-marker
+:start-after: <!-- getting-started-inclusion -->
+:end-before: <!-- getting-started-exclusion -->
 ```
diff --git a/doc/sphinx/source/gpu.md b/doc/sphinx/source/gpu.md
index f7418ac5f3..6040e52a8d 100644
--- a/doc/sphinx/source/gpu.md
+++ b/doc/sphinx/source/gpu.md
@@ -7,11 +7,12 @@ Code that produces correct results with CPU backends will produce correct result
 
 The filepath to the user source code is passed in {c:func}`CeedQFunctionCreateInterior` as the `source` argument.
 This filepath should typically be an absolute path to ensure the JiT compilation can locate the source file.
-The filepath may also be relative to a root directory set with {c:func}`CeedAddJitSourceRoot`.
-The {c:macro}`CEED_QFUNCTION` macro automatically creates a string with the absolute path stored in the variable `user_loc` for a {c:type}`CeedQFunctionUser` called `user`.
+The filepath may also be a relative path with respect to a root directory set with {c:func}`CeedAddJitSourceRoot`.
+The {c:macro}`CEED_QFUNCTION` macro automatically creates a string with the absolute path, for example a {c:type}`CeedQFunctionUser` called `user` would have this string stored in the variable `user_loc`.
 
-The entire contents of this file and all locally included files (`#include "foo.h"`) are used during JiT compilation for GPU backends.
-Installed headers (`#include <bar.h>`) are omitted in the source code passed to JiT, but the compilation environment may supply common headers such as `<math.h>`.
+The entire contents of this source file and all included files are used during JiT compilation for GPU backends.
+Include statements for system headers that are required for CPU compilation but are not available in GPU compilation environments should be guarded with `#ifdef CEED_RUNNING_JIT_PASS`.
+Any function definitions in these system headers must still be available in the GPU compilation environments, such as the contents of `<math.h>`.
 These source file must only contain syntax constructs supported by C99 and all targeted backends (i.e. CUDA for `/gpu/cuda`, OpenCL/SYCL for `/gpu/sycl`, etc.).
 
 All source files must be at the provided filepath at runtime for JiT to function.
@@ -20,10 +21,10 @@ All source files must be at the provided filepath at runtime for JiT to function
 
 GPU backends require stricter adherence to memory access assumptions, but CPU backends may occasionally report correct results despite violations of memory access assumptions.
 Both `CeedVector` and `CeedQFunctionContext` have read-only and read-write accessors, and `CeedVector` allow write-only access.
-Read-only access of `CeedVector` and `CeedQFunctionContext` memory spaces must be respected for proper GPU behavior.
+Read-only access of `CeedVector` and `CeedQFunctionContext` memory spaces must be respected for to ensure proper GPU behavior.
 Write-only access of `CeedVector` memory spaces asserts that all data in the `CeedVector` is invalid until overwritten.
 
-`CeedQFunction` assume that all input arrays are read-only and all output arrays are write-only and the {c:type}`CeedQFunctionUser` must adhere to these assumptions, only reading data in the input arrays and fully overwriting the output arrays.
+`CeedQFunction` assume that all input arrays are read-only and all output arrays are write-only and the {c:type}`CeedQFunctionUser` must adhere to these assumptions, only reading data in the input arrays and fully overwriting all entries in the output arrays.
 Additionally, {c:type}`CeedQFunctionUser` have read-write access for `CeedQFunctionContext` data, unless {c:func}`CeedQFunctionSetContextWritable` was used to indicate that read-only access is sufficient.
 
 The `/cpu/self/memcheck` backends explicitly verify read-only and write-only memory access assumptions.
diff --git a/doc/sphinx/source/index.md b/doc/sphinx/source/index.md
index 82272c54d4..f35c1bb5a0 100644
--- a/doc/sphinx/source/index.md
+++ b/doc/sphinx/source/index.md
@@ -8,9 +8,9 @@ intro
 gettingstarted
 libCEEDapi
 examples/index
+api/index
 ffi
 gpu
-api/index
 precision
 libCEEDdev
 Contributing <CONTRIBUTING>
diff --git a/doc/sphinx/source/intro.md b/doc/sphinx/source/intro.md
index 3c0d04e1ef..574672b31d 100644
--- a/doc/sphinx/source/intro.md
+++ b/doc/sphinx/source/intro.md
@@ -22,7 +22,7 @@ Furthermore, software packages that provide high-performance implementations hav
 libCEED's purely algebraic interface can unobtrusively be integrated in new and legacy software to provide performance portable interfaces.
 While libCEED's focus is on high-order finite elements, the approach is algebraic and thus applicable to other discretizations in factored form.
 libCEED's role, as a lightweight portable library that allows a wide variety of applications to share highly optimized discretization kernels, is illustrated in {numref}`fig-libCEED-backends`, where a non-exhaustive list of specialized implementations (backends) is provided.
-libCEED provides a low-level Application Programming Interface (API) for user codes so that applications with their own discretization infrastructure (e.g., those in [PETSc](https://www.mcs.anl.gov/petsc/), [MFEM](https://mfem.org/) and [Nek5000](https://nek5000.mcs.anl.gov/)) can evaluate and use the core operations provided by libCEED. GPU implementations are available via pure [CUDA](https://developer.nvidia.com/about-cuda) and pure [HIP](https://rocmdocs.amd.com) as well as the [OCCA](http://github.com/libocca/occa) and [MAGMA](https://bitbucket.org/icl/magma) libraries.
+libCEED provides a low-level Application Programming Interface (API) for user codes so that applications with their own discretization infrastructure (e.g., those in [PETSc](https://www.mcs.anl.gov/petsc/), [MFEM](https://mfem.org/) and [Nek5000](https://nek5000.mcs.anl.gov/)) can evaluate and use the core operations provided by libCEED. GPU implementations are available via pure [CUDA](https://developer.nvidia.com/about-cuda) and pure [HIP](https://rocmdocs.amd.com) as well as the [MAGMA](https://bitbucket.org/icl/magma) library.
 CPU implementations are available via pure C and AVX intrinsics as well as the [LIBXSMM](http://github.com/hfp/libxsmm) library.
 libCEED provides a unified interface, so that users only need to write a single source code and can select the desired specialized implementation at run time. Moreover, each process or thread can instantiate an arbitrary number of backends.
 
diff --git a/doc/sphinx/source/libCEEDapi.md b/doc/sphinx/source/libCEEDapi.md
index b43871e422..66caa15688 100644
--- a/doc/sphinx/source/libCEEDapi.md
+++ b/doc/sphinx/source/libCEEDapi.md
@@ -259,7 +259,7 @@ If greater than 1, the caller must ensure that the number of quadrature points `
 This is often satisfied automatically due to the element size or by batching elements together to facilitate vectorization in other stages, and can always be ensured by padding.
 
 In addition to the function pointers (`setup` and `mass`), {ref}`CeedQFunction` constructors take a string representation specifying where the source for the implementation is found.
-This is used by backends that support Just-In-Time (JIT) compilation (i.e., CUDA and OCCA) to compile for coprocessors.
+This is used by backends that support Just-In-Time (JIT) compilation (i.e., CUDA and HIP) to compile for coprocessors.
 For full support across all backends, these {ref}`CeedQFunction` source files must only contain constructs mutually supported by C99, C++11, and CUDA.
 For example, explicit type casting of void pointers and explicit use of compatible arguments for {code}`math` library functions is required, and variable-length array (VLA) syntax for array reshaping is only available via libCEED's {code}`CEED_Q_VLA` macro.
 
diff --git a/doc/sphinx/source/libCEEDdev.md b/doc/sphinx/source/libCEEDdev.md
index 7a009ee811..e311171a79 100644
--- a/doc/sphinx/source/libCEEDdev.md
+++ b/doc/sphinx/source/libCEEDdev.md
@@ -1,55 +1,150 @@
 # Developer Notes
 
-## Style Guide
+## Library Design
 
-Please check your code for style issues by running
+LibCEED has a single user facing API for creating and using the libCEED objects ({ref}`CeedVector`, {ref}`CeedBasis`, etc).
+Different Ceed backends are selected by instantiating a different {ref}`Ceed` object to create the other libCEED objects, in a [bridge pattern](https://en.wikipedia.org/wiki/Bridge_pattern).
+At runtime, the user can select the different backend implementations to target different hardware, such as CPUs or GPUs.
 
-`make format`
+When designing new features, developers should place the function definitions for the user facing API in the header `/include/ceed/ceed.h`.
+The basic implementation of these functions should typically be placed in `/interface/*.c` files.
+The interface should pass any computationally expensive or hardware specific operations to a backend implementation.
+A new method for the associated libCEED object can be added in `/include/ceed-impl.h`, with a corresponding `CEED_FTABLE_ENTRY` in `/interface/ceed.c` to allow backends to set their own implementations of this method.
+Then in the creation of the backend specific implementation of the object, typically found in `/backends/[impl]/ceed-[impl]-[object].c`, the developer creates the backend implementation of the specific method and calls {c:func}`CeedSetBackendFunction` to set this implementation of the method for the backend.
+Any supplemental functions intended to be used in the interface or by the backends may be added to the backend API in the header `/include/ceed/backend.h`.
+The basic implementation of these functions should also be placed in `/interface/*.c` files.
 
-In addition to those automatically enforced style rules, libCEED tends to follow the following code style conventions:
+LibCEED generally follows a "CPU first" implementation strategy when adding new functionality to the user facing API.
+If there are no performance specific considerations, it is generally recommended to include a basic CPU default implementation in `/interface/*.c`.
+Any new functions must be well documented and tested.
+Once the user facing API and the default implementation are in place and verified correct via tests, then the developer can focus on hardware specific implementations (AVX, CUDA, HIP, etc.) as necessary.
 
-- Variable names: `snake_case`
-- Strut members: `snake_case`
-- Function and method names: `PascalCase` or language specific style
-- Type names: `PascalCase` or language specific style
-- Constant names: `CAPS_SNAKE_CASE` or language specific style
+## Backend Inheritance
 
-Also, documentation files should have one sentence per line to help make git diffs clearer and less disruptive.
+A Ceed backend is not required to implement all libCeed objects or {ref}`CeedOperator` methods.
+There are three mechanisms by which a Ceed backend can inherit implementations from another Ceed backend.
 
-## Clang-tidy
+1. Delegation - Developers may use {c:func}`CeedSetDelegate` to set a general delegate {ref}`Ceed` object.
+   This delegate {ref}`Ceed` will provide the implementation of any libCeed objects that parent backend does not implement.
+   For example, the `/cpu/self/xsmm/serial` backend implements the `CeedTensorContract` object itself but delegates all other functionality to the `/cpu/self/opt/serial` backend.
 
-Please check your code for common issues by running
+2. Object delegation  - Developers may use {c:func}`CeedSetObjectDelegate` to set a delegate {ref}`Ceed` object for a specific libCEED object.
+   This delegate {ref}`Ceed` will only provide the implementation of that specific libCeed object for the parent backend.
+   Object delegation has higher precedence than delegation.
 
-`make tidy`
+3. Operator fallback - Developers may use {c:func}`CeedSetOperatorFallbackCeed` to set a {ref}`Ceed` object to provide any unimplemented {ref}`CeedOperator` methods that support preconditioning, such as {c:func}`CeedOperatorLinearAssemble`.
+   The parent backend must implement the basic {ref}`CeedOperator` functionality.
+   Like the delegates above, this fallback {ref}`Ceed` object should be created and set in the backend `CeedInit` function.
+   In order to use operator fallback, the parent backend and fallback backend must use compatible E-vector and Q-vector layouts.
+   For example, `/gpu/cuda/gen` falls back to `/gpu/cuda/ref` for missing {ref}`CeedOperator` preconditioning support methods.
+   If an unimplemented method is called, then the parent `/gpu/cuda/gen` {ref}`Ceed` object uses its fallback `/gpu/cuda/ref` {ref}`Ceed` object to create a clone of the {ref}`CeedOperator`.
+   This clone {ref}`CeedOperator` is then used for the unimplemented preconditioning support methods.
 
-which uses the `clang-tidy` utility included in recent releases of Clang.
-This tool is much slower than actual compilation (`make -j8` parallelism helps).
-To run on a single file, use
+## Backend Families
 
-`make interface/ceed.c.tidy`
+There are 4 general 'families' of backend implementations.
+As internal data layouts are specific to backend families, it is generally not possible to delegate between backend families.
 
-for example.
-All issues reported by `make tidy` should be fixed.
+### CPU Backends
 
-## Include-What-You-Use
+The basic CPU with the simplest implementation is `/cpu/self/ref/serial`.
+This backend contains the basic implementations of most objects that other backends rely upon.
+Most of the other CPU backends only update the {ref}`CeedOperator` and `CeedTensorContract` objects.
 
-Header inclusion for source files should follow the principal of 'include what you use' rather than relying upon transitive `#include` to define all symbols.
+The `/cpu/self/ref/blockend` and `/cpu/self/opt/*` backends delegate to the `/cpu/self/ref/serial` backend.
+The `/cpu/self/ref/blocked` backend updates the {ref}`CeedOperator` to use an E-vector and Q-vector ordering when data for 8 elements are interlaced to provide better vectorization.
+The `/cpu/self/opt/*` backends update the {ref}`CeedOperator` to apply the action of the operator in 1 or 8 element batches, depending upon if the blocking strategy is used.
+This reduced the memory required to utilize this backend significantly.
 
-Every symbol that is used in the source file `foo.c` should be defined in `foo.c`, `foo.h`, or in a header file `#include`d in one of these two locations.
-Please check your code by running the tool [`include-what-you-use`](https://include-what-you-use.org/) to see recommendations for changes to your source.
-Most issues reported by `include-what-you-use` should be fixed; however this rule is flexible to account for differences in header file organization in external libraries.
-If you have `include-what-you-use` installed in a sibling directory to libCEED or set the environment variable `IWYU_CC`, then you can use the makefile target `make iwyu`.
+The `/cpu/self/avx/*` and `/cpu/self/xsmm/*` backends delegate to the corresponding `/cpu/self/opt/*` backends.
+These backends update the `CeedTensorContract` objects using AVX intrinsics and libXSMM functions, respectively.
 
-Header files should be listed in alphabetical order, with installed headers preceding local headers and `ceed` headers being listed first.
-The `ceed-f64.h` and `ceed-f32.h` headers should only be included in `ceed.h`.
+The `/cpu/self/memcheck/*` backends delegate to the `/cpu/self/ref/*` backends.
+These backends replace many of the implementations with methods that include more verification checks and a memory management model that more closely matches the memory management for GPU backends.
+These backends rely upon the [Valgrind](https://valgrind.org/) Memcheck tool and Valgrind headers.
 
-```c
-#include <ceed.h>
-#include <ceed/backend.h>
-#include <stdbool.h>
-#include <string.h>
-#include "ceed-avx.h"
-```
+### GPU Backends
+
+The CUDA, HIP, and SYCL backend families all follow similar designs.
+The CUDA and HIP backends are very similar, with minor differences.
+While the SYCL backend was based upon the CUDA and HIP backends, there are more internal differences to accommodate OpenCL and Intel hardware.
+
+The `/gpu/*/ref` backends provide basic functionality.
+In these backends, the operator is applied in multiple separate kernel launches, following the libCEED operator decomposition, where first {ref}`CeedElemRestriction` kernels map from the L-vectors to E-vectors, then {ref}`CeedBasis` kernels map from the E-vectors to Q-vectors, then the {ref}`CeedQFunction` kernel provides the action of the user quadrature point function, and the transpose {ref}`CeedBasis` and {ref}`CeedElemRestriction` kernels are applied to go back to the E-vectors and finally the L-vectors.
+These kernels apply to all points across all elements in order to maximize the amount of work each kernel launch has.
+Some of these kernels are compiled at runtime via NVRTC, HIPRTC, or OpenCL RTC.
+
+The `/gpu/*/shared` backends delegate to the corresponding `/gpu/*/ref` backends.
+These backends use shared memory to improve performance for the {ref}`CeedBasis` kernels.
+All other libCEED objects are delegated to `/gpu/*/ref`.
+These kernels are compiled at runtime via NVRTC, HIPRTC, or OpenCL RTC.
+
+The `/gpu/*/gen` backends delegate to the corresponding `/gpu/*/shared` backends.
+These backends write a single comprehensive kernel to apply the action of the {ref}`CeedOperator`, significantly improving performance by eliminating intermediate data structures and reducing the total number of kernel launches required.
+This kernel is compiled at runtime via NVRTC, HIPRTC, or OpenCL RTC.
+
+The `/gpu/*/magma` backends delegate to the corresponding `/gpu/cuda/ref` and `/gpu/hip/ref` backends.
+These backends provide better performance for {ref}`CeedBasis` kernels but do not have the improvements from the `/gpu/*/gen` backends for {ref}`CeedOperator`.
+
+## Internal Layouts
+
+Ceed backends are free to use any E-vector and Q-vector data layout (including never fully forming these vectors) so long as the backend passes the `t5**` series tests and all examples.
+There are several common layouts for L-vectors, E-vectors, and Q-vectors, detailed below:
+
+- **L-vector** layouts
+
+  - L-vectors described by a standard {ref}`CeedElemRestriction` have a layout described by the `offsets` array and `comp_stride` parameter.
+    Data for node `i`, component `j`, element `k` can be found in the L-vector at index `offsets[i + k*elem_size] + j*comp_stride`.
+  - L-vectors described by a strided {ref}`CeedElemRestriction` have a layout described by the `strides` array.
+    Data for node `i`, component `j`, element `k` can be found in the L-vector at index `i*strides[0] + j*strides[1] + k*strides[2]`.
+
+- **E-vector** layouts
+
+  - If possible, backends should use {c:func}`CeedElemRestrictionSetELayout()` to use the `t2**` tests.
+    If the backend uses a strided E-vector layout, then the data for node `i`, component `j`, element `k` in the E-vector is given by `i*layout[0] + j*layout[1] + k*layout[2]`.
+  - Backends may choose to use a non-strided E-vector layout; however, the `t2**` tests will not function correctly in this case and these tests will need to be marked as allowable failures for this backend in the test suite.
+
+- **Q-vector** layouts
+
+  - When the size of a {ref}`CeedQFunction` field is greater than `1`, data for quadrature point `i` component `j` can be found in the Q-vector at index `i + Q*j`, where `Q` is the total number of quadrature points in the Q-vector.
+    Backends are free to provide the quadrature points in any order.
+  - When the {ref}`CeedQFunction` field has `emode` `CEED_EVAL_GRAD`, data for quadrature point `i`, component `j`, derivative `k` can be found in the Q-vector at index `i + Q*j + Q*num_comp*k`.
+  - Backend developers must take special care to ensure that the data in the Q-vectors for a field with `emode` `CEED_EVAL_NONE` is properly ordered when the backend uses different layouts for E-vectors and Q-vectors.
+
+## CeedVector Array Access
+
+Backend implementations are expected to separately track 'owned' and 'borrowed' memory locations.
+Backends are responsible for freeing 'owned' memory; 'borrowed' memory is set by the user and backends only have read/write access to 'borrowed' memory.
+For any given precision and memory type, a backend should only have 'owned' or 'borrowed' memory, not both.
+
+Backends are responsible for tracking which memory locations contain valid data.
+If the user calls {c:func}`CeedVectorTakeArray` on the only memory location that contains valid data, then the {ref}`CeedVector` is left in an *invalid state*.
+To repair an *invalid state*, the user must set valid data by calling {c:func}`CeedVectorSetValue`, {c:func}`CeedVectorSetArray`, or {c:func}`CeedVectorGetArrayWrite`.
+
+Some checks for consistency and data validity with {ref}`CeedVector` array access are performed at the interface level.
+All backends may assume that array access will conform to these guidelines:
+
+- Borrowed memory
+
+  - {ref}`CeedVector` access to borrowed memory is set with {c:func}`CeedVectorSetArray` with `copy_mode = CEED_USE_POINTER` and revoked with {c:func}`CeedVectorTakeArray`.
+    The user must first call {c:func}`CeedVectorSetArray` with `copy_mode = CEED_USE_POINTER` for the appropriate precision and memory type before calling {c:func}`CeedVectorTakeArray`.
+  - {c:func}`CeedVectorTakeArray` cannot be called on a vector in a *invalid state*.
+
+- Owned memory
+
+  - Owned memory can be allocated by calling {c:func}`CeedVectorSetValue` or by calling {c:func}`CeedVectorSetArray` with `copy_mode = CEED_COPY_VALUES`.
+  - Owned memory can be set by calling {c:func}`CeedVectorSetArray` with `copy_mode = CEED_OWN_POINTER`.
+  - Owned memory can also be allocated by calling {c:func}`CeedVectorGetArrayWrite`.
+    The user is responsible for manually setting the contents of the array in this case.
+
+- Data validity
+
+  - Internal synchronization and user calls to {c:func}`CeedVectorSync` cannot be made on a vector in an *invalid state*.
+  - Calls to {c:func}`CeedVectorGetArray` and {c:func}`CeedVectorGetArrayRead` cannot be made on a vector in an *invalid state*.
+  - Calls to {c:func}`CeedVectorSetArray` and {c:func}`CeedVectorSetValue` can be made on a vector in an *invalid state*.
+  - Calls to {c:func}`CeedVectorGetArrayWrite` can be made on a vector in an *invalid* state.
+    Data synchronization is not required for the memory location returned by {c:func}`CeedVectorGetArrayWrite`.
+    The caller should assume that all data at the memory location returned by {c:func}`CeedVectorGetArrayWrite` is *invalid*.
 
 ## Shape
 
@@ -65,10 +160,10 @@ For example, the comment
 means that it can be traversed as
 
 ```c
-for (d=0; d<dim; d++)
-  for (c=0; c<num_comp; c++)
-    for (q=0; q<Q; q++)
-      for (e=0; e<num_elem; e++)
+for (d = 0; d < dim; d++) {
+  for (c = 0; c < num_comp; c++) {
+    for (q = 0; q < Q; q++) {
+      for (e = 0; e < num_elem; e++) {
         u[((d*num_comp + c)*Q + q)*num_elem + e] = ...
 ```
 
@@ -92,80 +187,111 @@ are purely implicit -- one just indexes the same array using the appropriate con
 QFunction arguments can be assumed to have `restrict` semantics.
 That is, each input and output array must reside in distinct memory without overlap.
 
-## CeedVector Array Access Semantics
+## Style Guide
 
-Backend implementations are expected to separately track 'owned' and 'borrowed' memory locations.
-Backends are responsible for freeing 'owned' memory; 'borrowed' memory is set by the user and backends only have read/write access to 'borrowed' memory.
-For any given precision and memory type, a backend should only have 'owned' or 'borrowed' memory, not both.
+Please check your code for style issues by running
 
-Backends are responsible for tracking which memory locations contain valid data.
-If the user calls {c:func}`CeedVectorTakeArray` on the only memory location that contains valid data, then the {ref}`CeedVector` is left in an *invalid state*.
-To repair an *invalid state*, the user must set valid data by calling {c:func}`CeedVectorSetValue`, {c:func}`CeedVectorSetArray`, or {c:func}`CeedVectorGetArrayWrite`.
+`make format`
 
-Some checks for consistency and data validity with {ref}`CeedVector` array access are performed at the interface level.
-All backends may assume that array access will conform to these guidelines:
+In addition to those automatically enforced style rules, libCEED tends to follow the following code style conventions:
 
-- Borrowed memory
+- Variable names: `snake_case`
+- Strut members: `snake_case`
+- Function and method names: `PascalCase` or language specific style
+- Type names: `PascalCase` or language specific style
+- Constant names: `CAPS_SNAKE_CASE` or language specific style
 
-  - {ref}`CeedVector` access to borrowed memory is set with {c:func}`CeedVectorSetArray` with `copy_mode = CEED_USE_POINTER` and revoked with {c:func}`CeedVectorTakeArray`.
-    The user must first call {c:func}`CeedVectorSetArray` with `copy_mode = CEED_USE_POINTER` for the appropriate precision and memory type before calling {c:func}`CeedVectorTakeArray`.
-  - {c:func}`CeedVectorTakeArray` cannot be called on a vector in a *invalid state*.
+In general, variable and function names should avoid abbreviations and err on the side of verbosity to improve readability.
 
-- Owned memory
+Also, documentation files should have one sentence per line to help make git diffs clearer and less disruptive.
 
-  - Owned memory can be allocated by calling {c:func}`CeedVectorSetValue` or by calling {c:func}`CeedVectorSetArray` with `copy_mode = CEED_COPY_VALUES`.
-  - Owned memory can be set by calling {c:func}`CeedVectorSetArray` with `copy_mode = CEED_OWN_POINTER`.
-  - Owned memory can also be allocated by calling {c:func}`CeedVectorGetArrayWrite`.
-    The user is responsible for manually setting the contents of the array in this case.
+## Function Conventions
 
-- Data validity
+### Naming
+All functions in the libCEED library should be prefixed by `Ceed` and generally take a `Ceed` object as its first argument.
+If a function takes, for example, a `CeedOperator` as its first argument, then it should be prefixed with `CeedOperator`.
 
-  - Internal synchronization and user calls to {c:func}`CeedVectorSync` cannot be made on a vector in an *invalid state*.
-  - Calls to {c:func}`CeedVectorGetArray` and {c:func}`CeedVectorGetArrayRead` cannot be made on a vector in an *invalid state*.
-  - Calls to {c:func}`CeedVectorSetArray` and {c:func}`CeedVectorSetValue` can be made on a vector in an *invalid state*.
-  - Calls to {c:func}`CeedVectorGetArrayWrite` can be made on a vector in an *invalid* state.
-    Data synchronization is not required for the memory location returned by {c:func}`CeedVectorGetArrayWrite`.
-    The caller should assume that all data at the memory location returned by {c:func}`CeedVectorGetArrayWrite` is *invalid*.
+### Style
+Functions should adhere mostly to the PETSc function style, specifically:
 
-## Internal Layouts
+1. All local variables of a particular type (for example, `CeedInt`) should be listed on the same line if possible; otherwise, they should be listed on adjacent lines. For example,
+```c
+// Correct
+CeedInt   a, b, c;
+CeedInt  *d, *e;
+CeedInt **f;
 
-Ceed backends are free to use any **E-vector** and **Q-vector** data layout, to include never fully forming these vectors, so long as the backend passes the `t5**` series tests and all examples.
-There are several common layouts for **L-vectors**, **E-vectors**, and **Q-vectors**, detailed below:
+// Incorrect
+CeedInt a, b, c, *d, *e, **f;
+```
+  
+2. Local variables should be initialized in their declaration when possible.
+3. Nearly all functions should have a return type of `int` and return a `CeedErrorType` to allow for error checking.
+4. All functions must start with a single blank line after the local variable declarations.
+5. All libCEED function calls must have their return value checked for errors using the `CeedCall()` or `CeedCallBackend()` macro.
+   This should be wrapped around the function in question.
+6. In libCEED functions, variables must be declared at the beginning of the code block (C90 style), never mixed in with code.
+   However, when variables are only used in a limited scope, it is encouraged to declare them in that scope.
+7. Do not put a blank line immediately before `return CEED_ERROR_SUCCESS;`.
+8. All libCEED functions must use Doxygen comment blocks before their *definition* (not declaration).
+   The block should begin with `/**` and end with `**/`, each on their own line.
+   The block should be indented by two spaces and should contain an `@brief` tag and description, a newline, a line stating whether the function is collective, a  newline, `@param` tags for each parameter, a newline, and a `@return` line formatted exactly as in the example below.
+   All parameter lines in the Doxygen block should be formatted such that parameter names and descriptions are aligned.
+   There should be a exactly one space between `@param[dir]` (where `dir` is `in`, `out`, or `in,out`) and the parameter name for the closest pair, as well as  between the parameter name and description.
+    For example:
+```c
+/**
+  @brief Initialize a `Ceed` context to use the specified resource.
 
-- **L-vector** layouts
+  Note: Prefixing the resource with "help:" (e.g. "help:/cpu/self") will result in @ref CeedInt() printing the current libCEED version number and a list of current available backend resources to `stderr`.
 
-  - **L-vectors** described by a {ref}`CeedElemRestriction` have a layout described by the `offsets` array and `comp_stride` parameter.
-    Data for node `i`, component `j`, element `k` can be found in the **L-vector** at index `offsets[i + k*elem_size] + j*comp_stride`.
-  - **L-vectors** described by a strided {ref}`CeedElemRestriction` have a layout described by the `strides` array.
-    Data for node `i`, component `j`, element `k` can be found in the **L-vector** at index `i*strides[0] + j*strides[1] + k*strides[2]`.
+  @param[in]  resource Resource to use, e.g., "/cpu/self"
+  @param[out] ceed     The library context
 
-- **E-vector** layouts
+  @return An error code: 0 - success, otherwise - failure
 
-  - If possible, backends should use {c:func}`CeedElemRestrictionSetELayout()` to use the `t2**` tests.
-    If the backend uses a strided **E-vector** layout, then the data for node `i`, component `j`, element `k` in the **E-vector** is given by `i*layout[0] + j*layout[1] + k*layout[2]`.
-  - Backends may choose to use a non-strided **E-vector** layout; however, the `t2**` tests will not function correctly in this case and the tests will need to be whitelisted for the backend to pass the test suite.
+  @ref User
 
-- **Q-vector** layouts
+  @sa CeedRegister() CeedDestroy()
+**/
+int CeedInit(const char *resource, Ceed *ceed) {
+```
+9. Function declarations should include parameter names, which must exactly match those in the function definition.
+10. External functions, i.e. those used in tests or examples, must have their *declarations* prefixed with `CEED_EXTERN`.
+    All other functions should have their *declarations* prefixed with `CEED_INTERN`.
+    Function *definitions* should have neither.
 
-  - When the size of a {ref}`CeedQFunction` field is greater than `1`, data for quadrature point `i` component `j` can be found in the **Q-vector** at index `i + Q*j`.
-    Backends are free to provide the quadrature points in any order.
-  - When the {ref}`CeedQFunction` field has `emode` `CEED_EVAL_GRAD`, data for quadrature point `i`, component `j`, derivative `k` can be found in the **Q-vector** at index `i + Q*j + Q*size*k`.
-  - Note that backend developers must take special care to ensure that the data in the **Q-vectors** for a field with `emode` `CEED_EVAL_NONE` is properly ordered when the backend uses different layouts for **E-vectors** and **Q-vectors**.
+## Clang-tidy
 
-## Backend Inheritance
+Please check your code for common issues by running
 
-There are three mechanisms by which a Ceed backend can inherit implementation from another Ceed backend.
-These options are set in the backend initialization routine.
+`make tidy`
 
-1. Delegation - Developers may use {c:func}`CeedSetDelegate()` to set a backend that will provide the implementation of any unimplemented Ceed objects.
-2. Object delegation  - Developers may use {c:func}`CeedSetObjectDelegate()` to set a backend that will provide the implementation of a specific unimplemented Ceed object.
-   Object delegation has higher precedence than delegation.
-3. Operator fallback - Developers may use {c:func}`CeedSetOperatorFallbackResource()` to set a {ref}`Ceed` resource that will provide the implementation of unimplemented {ref}`CeedOperator` methods.
-   A fallback {ref}`Ceed` with this resource will only be instantiated if a method is called that is not implemented by the parent {ref}`Ceed`.
-   In order to use the fallback mechanism, the parent {ref}`Ceed` and fallback resource must use compatible **E-vector** and **Q-vector** layouts.
+which uses the `clang-tidy` utility included in recent releases of Clang.
+This tool is much slower than actual compilation (`make -j8` parallelism helps).
+To run on a single file, use
 
-For example, the `/cpu/self/xsmm/serial/` backend implements the `CeedTensorContract` object but delegates all other functionality to the `/cpu/self/opt/serial` backend.
-The `/cpu/self/opt/serial` backend implements the `CeedTensorContract` and `CeedOperator` objects but delegates all other functionality to the `/cpu/self/ref/serial` backend.
+`make interface/ceed.c.tidy`
 
-If the `/cpu/self/opt/serial` backend had missing {ref}`CeedOperator` functionality, then it could fallback to `/cpu/self/ref/serial` for missing methods.
-In this case, the fallback {ref}`Ceed` would clone the `/cpu/self/opt/serial` {ref}`CeedOperator` and use this clone to execute the missing functionality.
+for example.
+All issues reported by `make tidy` should be fixed.
+
+## Include-What-You-Use
+
+Header inclusion for source files should follow the principal of 'include what you use' rather than relying upon transitive `#include` to define all symbols.
+
+Every symbol that is used in the source file `foo.c` should be defined in `foo.c`, `foo.h`, or in a header file `#include`d in one of these two locations.
+Please check your code by running the tool [`include-what-you-use`](https://include-what-you-use.org/) to see recommendations for changes to your source.
+Most issues reported by `include-what-you-use` should be fixed; however this rule is flexible to account for differences in header file organization in external libraries.
+If you have `include-what-you-use` installed in a sibling directory to libCEED or set the environment variable `IWYU_CC`, then you can use the makefile target `make iwyu`.
+
+Header files should be listed in alphabetical order, with installed headers preceding local headers and `ceed` headers being listed first.
+The `ceed-f64.h` and `ceed-f32.h` headers should only be included in `ceed.h`.
+
+```c
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <stdbool.h>
+#include <string.h>
+#include "ceed-avx.h"
+```
diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md
index 19300bc7c5..682e76f13b 100644
--- a/doc/sphinx/source/releasenotes.md
+++ b/doc/sphinx/source/releasenotes.md
@@ -11,6 +11,12 @@ On this page we provide a summary of the main API changes, new features and exam
 - Add `bool` field type for `CeedQFunctionContext` and related interfaces to use `bool` fields.
 - `CEED_BASIS_COLLOCATED` removed; users should only use `CEED_BASIS_NONE`.
 - Remove unneeded pointer for `CeedElemRestrictionGetELayout`.
+- Change QFunction source include file handling in JiT compilers
+    - Add `CEED_RUNNING_JIT_PASS` compiler definition for wrapping header files that device JiT compilers cannot read
+    - Users should now prefer `#include <ceed/types.h>` rather than `#include <ceed.h>` in QFunction source files
+- Require use of `Ceed*Destroy()` on Ceed objects returned from `Ceed*Get*()`.
+- Rename `CeedCompositeOperatorCreate()` to `CeedOperatorCreateComposite()` for uniformity.
+- Rename `CeedCompositeOperator*()` to `CeedOperatorComposite*()` for uniformity.
 
 ### New features
 
@@ -18,11 +24,21 @@ On this page we provide a summary of the main API changes, new features and exam
 - Add `CeedElemRestrictionGetLLayout` to provide L-vector layout for strided `CeedElemRestriction` created with `CEED_BACKEND_STRIDES`.
 - Add `CeedVectorReturnCeed` and similar when parent `Ceed` context for a libCEED object is only needed once in a calling scope.
 - Enable `#pragma once` for all JiT source; remove duplicate includes in JiT source string before compilation.
+- Allow user to set additional compiler options for CUDA and HIP JiT.
+Specifically, directories set with `CeedAddJitSourceRoot(ceed, "foo/bar")` will be used to set `-Ifoo/bar` and defines set with `CeedAddJitDefine(ceed, "foo=bar")` will be used to set `-Dfoo=bar`.
+- Added non-tensor basis support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen`.
+- Added support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen` for operators with both tensor and non-tensor bases.
+- Add `CeedGetGitVersion()` to access the Git commit and dirty state of the repository at build time.
+- Add `CeedGetBuildConfiguration()` to access compilers, flags, and related information about the build environment.
 
 ### Examples
 
 - Add deal.II example with CEED BP suite.
 
+### Maintainability
+
+- OCCA backends were retired.
+
 (v0-12)=
 
 ## v0.12 (Oct 31, 2023)
diff --git a/examples/Makefile b/examples/Makefile
index d32f406f5a..4cb4a1ed9e 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -36,6 +36,9 @@ all: ceed mfem nek petsc fluids solids
 ceed:
 	make CEED_DIR=$(CEED_DIR) -C ceed all
 
+dealii:
+	$(RM) -rf deal.II/build
+
 mfem:
 	make CEED_DIR=$(CEED_DIR) MFEM_DIR=$(MFEM_DIR) -C mfem all
 
@@ -53,10 +56,12 @@ fluids:
 
 solids:
 	make CEED_DIR=$(CEED_DIR) PETSC_DIR=$(PETSC_DIR) PETSC_ARCH=$(PETSC_ARCH) -C solids all
-clean:
+
+clean: dealii
 	+make -C ceed clean
 	+make -C mfem clean
 	+make -C nek clean
+	+make -C rust-qfunctions clean
 	+make -C petsc clean
 	+make -C fluids clean
 	+make -C solids clean
diff --git a/examples/README.md b/examples/README.md
index e1177992f0..006d6c0c71 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -9,11 +9,11 @@ For more details, please see the dedicated [documentation section](https://libce
 
 ## Bakeoff Problems
 
-% bps-inclusion-marker
+<!-- bps-inclusion -->
 
 The Center for Efficient Exascale Discretizations (CEED) uses Bakeoff Problems (BPs) to test and compare the performance of high-order finite element implementations.
 The definitions of the problems are given on the ceed [website](https://ceed.exascaleproject.org/bps/).
-Each of the following bakeoff problems that use external discretization libraries (such as MFEM, PETSc, and Nek5000) are located in the subdirectories `mfem/`, `petsc/`, and `nek5000/`, respectively.
+Each of the following bakeoff problems that use external discretization libraries (such as deal.II, MFEM, PETSc, and Nek5000) are located in the subdirectories `deal.II/`, `mfem/`, `petsc/`, and `nek5000/`, respectively.
 
 Here we provide a short summary:
 
@@ -22,6 +22,13 @@ Here we provide a short summary:
 :widths: auto
 * - User code
   - Supported BPs
+* - `deal.II`
+  - * BP1 (scalar mass operator) with $Q=P+1$
+    * BP2 (vector mass operator) with $Q=P+1$
+    * BP3 (scalar Laplace operator) with $Q=P+1$
+    * BP4 (vector Laplace operator) with $Q=P+1$
+    * BP5 (collocated scalar Laplace operator) with $Q=P$
+    * BP6 (collocated vector Laplace operator) with $Q=P$
 * - `mfem`
   - * BP1 (scalar mass operator) with $Q=P+1$
     * BP3 (scalar Laplace operator) with $Q=P+1$
@@ -46,16 +53,16 @@ The BPs are parametrized by the number $P$ of Gauss-Legendre-Lobatto nodal point
 A $Q$-point Gauss-Legendre quadrature is used for all BPs except BP5 and BP6, which choose $Q = P$ and Gauss-Legendre-Lobatto quadrature to collocate with the interpolation nodes.
 This latter choice is popular in applications that use spectral element methods because it produces a diagonal mass matrix (enabling easy explicit time integration) and significantly reduces the number of floating point operations to apply the operator.
 
-% bps-exclusion-marker
+<!-- bps-exclusion -->
 
 For a more detailed description of the operators employed in the BPs, please see the dedicated [BPs documentation section](https://libceed.org/en/latest/examples/bps.html).
 
-## PETSc+libCEED Navier-Stokes Solver
+## PETSc+libCEED Fluid Dynamics Navier-Stokes Mini-App
 
 The Navier-Stokes problem solves the compressible Navier-Stokes equations using an explicit or implicit time integration.
 A more detailed description of the problem formulation can be found in the [fluids/](./fluids) folder and the corresponding [fluids documentation page](https://libceed.org/en/latest/examples/fluids/index.html).
 
-## PETSc+libCEED Solid mechanics elasticity mini-app
+## PETSc+libCEED Solid Mechanics Elasticity Mini-App
 
 This example solves the steady-state static momentum balance equations using unstructured high-order finite/spectral element spatial discretizations.
 A more detailed description of the problem formulation can be found in the [solids/](./solids) folder and the corresponding [solids documentation page](https://libceed.org/en/latest/examples/solids/index.html).
@@ -70,11 +77,20 @@ For a detailed description, please see the corresponding [area documentation pag
 These examples, located in the [petsc/](./petsc) folder, reproduce the Bakeoff Problems 1-6 on a discrete cubed-sphere, using PETSc.
 For a detailed description, please see the corresponding [problems on the cubed-sphere documentation page](https://libceed.org/en/latest/examples/petsc/index.html#bakeoff-problems-on-the-cubed-sphere).
 
+## libCEED Python Examples
+
+These Jupyter notebooks explore the concepts of the libCEED API, including how to install the Python interface and the usage of each API object, with interactive examples.
+The basic libCEED C examples in `/ceed` folder are also available as Python examples.
+
+## libCEED Rust Examples
+
+The basic libCEED C examples in `/ceed` folder are also available as Rust examples.
+
 ## Running Examples
 
-To build the examples, set the `MFEM_DIR`, `PETSC_DIR`, and `NEK5K_DIR` variables and, from the `examples/` directory, run
+To build the examples, set the `DEAL_II_DIR`, `MFEM_DIR`, `PETSC_DIR`, and `NEK5K_DIR` variables and, from the `examples/` directory, run
 
 ```{include} ../README.md
-:start-after: running-examples-inclusion-marker
-:end-before: benchmarks-marker
+:start-after: <!-- running-examples-inclusion -->
+:end-before: <!-- running-examples-exclusion -->
 ```
diff --git a/examples/bps.md b/examples/bps.md
index 47ba00e80e..7014c71f77 100644
--- a/examples/bps.md
+++ b/examples/bps.md
@@ -3,8 +3,8 @@
 # CEED Bakeoff Problems
 
 ```{include} ./README.md
-:start-after: bps-inclusion-marker
-:end-before: bps-exclusion-marker
+:start-after: <!-- bps-inclusion -->
+:end-before: <!-- bps-exclusion -->
 ```
 
 (mass-operator)=
diff --git a/examples/ceed/.gitignore b/examples/ceed/.gitignore
index 9f00fb96a8..9250d2275b 100644
--- a/examples/ceed/.gitignore
+++ b/examples/ceed/.gitignore
@@ -1,2 +1,3 @@
 ex1-volume
 ex2-surface
+ex3-volume
diff --git a/examples/ceed/Makefile b/examples/ceed/Makefile
index 57528cc1cd..db88064a1e 100644
--- a/examples/ceed/Makefile
+++ b/examples/ceed/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
@@ -9,7 +9,7 @@ OPT ?= -O -g
 
 # Ceed directory
 CEED_DIR ?= ../..
-CEED_FLAGS ?= -I$(CEED_DIR)/include -std=c99 $(OPT)
+CEED_FLAGS ?= -I$(CEED_DIR)/include -std=c11 $(OPT)
 CEED_LIBS ?= -Wl,-rpath,$(abspath $(CEED_DIR)/lib) -L$(CEED_DIR)/lib -lceed -lm
 
 EXAMPLES.c = $(wildcard ex*.c)
diff --git a/examples/ceed/README.md b/examples/ceed/README.md
index 6d4543b2e3..0d6c64188c 100644
--- a/examples/ceed/README.md
+++ b/examples/ceed/README.md
@@ -1,4 +1,4 @@
-## libCEED: Basic Examples
+## libCEED Basic Examples
 
 Two examples are provided that rely only upon libCEED without any external libraries.
 
@@ -9,3 +9,8 @@ This example uses the mass matrix to compute the length, area, or volume of a re
 ### Example 2: ex2-surface
 
 This example uses the diffusion matrix to compute the surface area of a region, in 1D, 2D or 3D, depending upon runtime parameters.
+
+### Example 3: ex3-volume
+
+This example uses the mass matrix to compute the length, area, or volume of a region, depending upon runtime parameters.
+Unlike ex1, this example also adds the diffusion matrix to add a zero contribution to this calculation while demonstrating the ability of libCEED to handle multiple basis evaluation modes on the same input and output vectors.
diff --git a/examples/ceed/ex1-volume-f-c.h b/examples/ceed/ex1-volume-f-c.h
new file mode 100644
index 0000000000..a3316192ff
--- /dev/null
+++ b/examples/ceed/ex1-volume-f-c.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+/// libCEED Q-function for building quadrature data for a mass operator
+CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  long long int *build_data = (long long int *)ctx;
+
+  // in[0] is Jacobians with shape [dim, dim, Q]
+  // in[1] is quadrature weights with shape [1, Q]
+  const CeedScalar *w      = in[1];
+  CeedScalar       *q_data = out[0];
+
+  switch (build_data[0] + 10 * build_data[1]) {
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[0][0][i] * w[i]; }  // End of Quadrature Point Loop
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i];
+      }  // End of Quadrature Point Loop
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        q_data[i] =
+            (J[0][0][i] * (J[1][1][i] * J[2][2][i] - J[1][2][i] * J[2][1][i]) - J[0][1][i] * (J[1][0][i] * J[2][2][i] - J[1][2][i] * J[2][0][i]) +
+             J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) *
+            w[i];
+      }  // End of Quadrature Point Loop
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// libCEED Q-function for applying a mass operator
+CEED_QFUNCTION(apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0], out[0] are solution variables with shape [1, Q]
+  // in[1] is quadrature data with shape [1, Q]
+  const CeedScalar *u = in[0], *q_data = in[1];
+  CeedScalar       *v = out[0];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = q_data[i] * u[i]; }  // End of Quadrature Point Loop
+  return CEED_ERROR_SUCCESS;
+}
diff --git a/examples/ceed/ex1-volume-f.f90 b/examples/ceed/ex1-volume-f.f90
new file mode 100644
index 0000000000..580874efc2
--- /dev/null
+++ b/examples/ceed/ex1-volume-f.f90
@@ -0,0 +1,557 @@
+! Copyright (c) 2017-2026,  Lawrence Livermore National Security,  LLC and other CEED contributors.
+! All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+
+! SPDX-License-Identifier: BSD-2-Clause
+
+! This file is part of CEED:  http:Cgithub.com/ceed
+
+! libCEED Example 1
+
+! This example illustrates a simple usage of libCEED to compute the volume of a 3D body using matrix-free application of a mass operator.
+! Arbitrary mesh and solution degrees in 1D,  2D and 3D are supported from the same code.
+
+! The example has no dependencies,  and is designed to be self-contained.
+! For additional examples that use external discretization libraries (MFEM,  PETSc,  etc.) see the subdirectories in libceed/examples.
+
+! All libCEED objects use a Ceed device object constructed based on a command line argument (-ceed).
+
+! Build with:
+
+!     make ex1-volume [CEED_DIR = </path/to/libceed>]
+
+! Sample runs:
+
+!     ./ex1-volume-f
+!     ./ex1-volume-f -ceed /cpu/self
+!     ./ex1-volume-f -ceed /gpu/cuda
+
+! Test in 1D-3D
+! TESTARGS(name = "1D User QFunction") -ceed {ceed_resource} -d 1 -t
+! TESTARGS(name = "2D User QFunction") -ceed {ceed_resource} -d 2 -t
+! TESTARGS(name = "3D User QFunction") -ceed {ceed_resource} -d 3 -t
+! TESTARGS(name = "1D Gallery QFunction") -ceed {ceed_resource} -d 1 -t -g
+! TESTARGS(name = "2D Gallery QFunction") -ceed {ceed_resource} -d 2 -t -g
+! TESTARGS(name = "3D Gallery QFunction") -ceed {ceed_resource} -d 3 -t -g
+
+!> @file
+!> libCEED example using mass operator to compute volume
+
+    include 'ex1-volume-f.h'
+
+!-----------------------------------------------------------------------
+subroutine getcartesianmeshsize(fe_dim, degree, prob_size, num_xyz)
+    implicit none
+    integer fe_dim
+    integer degree
+    integer prob_size
+    integer num_xyz(3)
+
+    integer num_elem
+    integer s, r, d, sd
+    num_elem = prob_size/(degree**fe_dim)
+    s = 0
+
+! Use the approximate formula:
+!    prob_size ~ num_elem * degree^dim
+! find s: num_elem/2 < 2^s <= num_elem
+
+  do while (num_elem > 1)
+    num_elem = num_elem/2
+    s = s + 1
+  end do
+  r = mod(s, fe_dim)
+
+  do d = 1, fe_dim
+    sd = s/fe_dim
+    if (r > 0) then
+      sd = sd + 1
+      r = r - 1
+    end if
+    num_xyz(d) = ISHFT(1, sd)
+  end do
+end
+
+!-----------------------------------------------------------------------
+subroutine buildcartesianrestriction(ceed, fe_dim, num_xyz, degree, num_comp, mesh_size, num_qpts, restriction,&
+&     q_data_restriction, err)
+    implicit none
+    include 'ceed/fortran.h'
+
+    integer ceed
+    integer fe_dim
+    integer num_xyz(3)
+    integer degree
+    integer num_comp
+    integer mesh_size
+    integer num_qpts
+    integer restriction
+    integer q_data_restriction
+    integer err
+
+    integer p
+    integer num_nodes
+    integer elem_qpts
+    integer num_elem
+    integer scalar_size
+    integer nd(3)
+    integer elem_nodes_size
+    integer e_xyz(3),  re
+    integer g_nodes, g_nodes_stride, r_nodes
+    integer, dimension (:), allocatable :: elem_nodes
+
+    integer i, j, k
+
+    p = degree + 1
+    num_nodes = p**fe_dim
+    elem_qpts = num_qpts**fe_dim
+    num_elem  = 1
+    scalar_size = 1
+
+    do i = 1, fe_dim
+      num_elem = num_elem * num_xyz(i)
+      nd(i) = num_xyz(i) * (p - 1) + 1
+      scalar_size = scalar_size*nd(i)
+    end do
+    mesh_size = scalar_size*num_comp
+! elem:       0         1             n-1
+!         |---*-...-*---|---*-...-*---|- ... -|--...--|
+! num_nodes:   0   1    p-1  p  p+1     2*p         n*p
+    elem_nodes_size = num_elem*num_nodes
+    allocate (elem_nodes(elem_nodes_size))
+
+    do i = 1, num_elem
+      e_xyz(1) = 1
+      e_xyz(2) = 1
+      e_xyz(3) = 1
+      re = i - 1
+
+      do j = 1, fe_dim
+        e_xyz(j) = mod(re, num_xyz(j))
+        re = re/num_xyz(j)
+      end do
+
+      do j = 1, num_nodes
+        g_nodes = 0
+        g_nodes_stride = 1
+        r_nodes = j - 1
+
+        do k = 1, fe_dim
+          g_nodes = g_nodes + (e_xyz(k) * (p - 1) + mod(r_nodes, p)) * g_nodes_stride
+          g_nodes_stride = g_nodes_stride * nd(k)
+          r_nodes = r_nodes/p
+        end do
+        elem_nodes((i - 1) * num_nodes + j) = g_nodes
+      end do
+    end do
+
+    call ceedelemrestrictioncreate(ceed, num_elem, num_nodes, num_comp, scalar_size, mesh_size, ceed_mem_host,&
+             &ceed_copy_values, elem_nodes, restriction, err)
+    if (q_data_restriction /=  ceed_qfunction_none) then
+      call ceedelemrestrictioncreatestrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem,&
+               &ceed_strides_backend, q_data_restriction, err)
+    end if
+    deallocate (elem_nodes)
+end
+
+!-----------------------------------------------------------------------
+subroutine transformmeshcoords(fe_dim, mesh_size, coords, exact_volume, err)
+    implicit none
+
+    integer fe_dim
+    integer mesh_size, scalar_size
+    real*8 coords(mesh_size)
+    real*8 exact_volume
+    real*8 m_pi, m_pi_2
+    parameter(m_pi = 3.14159265358979323846d0)
+    parameter(m_pi_2 = 1.57079632679489661923d0)
+    integer err
+
+    integer i
+    real*8 u, v
+
+    scalar_size = mesh_size/fe_dim
+    select case (fe_dim)
+    case (1)
+      do i = 1, scalar_size
+        coords(i) = 0.5d0 + (1.d0/sqrt(3.d0)) * sin((2.d0/3.d0) * m_pi * (coords(i) - 0.5d0))
+      end do
+      exact_volume = 1.d0
+
+    case (2,  3)
+      do i = 1, scalar_size
+        u = 1.d0 + coords(i)
+        v = m_pi_2 * coords(i + scalar_size)
+
+        coords(i)               = u * cos(v)
+        coords(i + scalar_size) = u * sin(v)
+      end do
+      exact_volume = 3.d0/4.d0 * m_pi
+    end select
+end
+
+!-----------------------------------------------------------------------
+subroutine setcartesianmeshcoords(fe_dim, num_xyz, mesh_degree, mesh_coords, exact_volume, err)
+    implicit none
+    include 'ceed/fortran.h'
+
+    integer fe_dim
+    integer num_xyz(3)
+    integer mesh_degree
+    integer mesh_coords
+    real*8 exact_volume
+    integer err
+
+    integer p
+    integer scalar_size
+    integer coords_size
+    integer r_nodes
+    integer d_1d
+    integer nd(3)
+    real*8, dimension (:), allocatable :: nodes,  qpts
+    real*8, dimension (:), allocatable :: coords
+    integer*8 offset
+    integer i, j
+    p = mesh_degree + 1
+    scalar_size = 1
+
+    do i = 1, fe_dim
+      nd(i) = num_xyz(i) * (p - 1) + 1
+      scalar_size = scalar_size * nd(i)
+    end do
+
+    coords_size = scalar_size * fe_dim
+    allocate (coords(coords_size))
+
+! The H1 basis uses Lobatto quadrature points as nodes
+    allocate (nodes(p))
+    allocate (qpts(p))
+    call ceedlobattoquadrature(p, nodes, qpts, err)
+    deallocate(qpts)
+    do i = 1, p
+      nodes(i) = 0.5 + 0.5 * nodes(i)
+    end do
+
+    do i = 1, scalar_size
+      r_nodes = i - 1
+
+      do j = 1, fe_dim
+        d_1d  =  mod(r_nodes, nd(j))
+        coords(scalar_size * (j - 1) + i) = ((d_1d/(p - 1)) + nodes(mod(d_1d, p - 1) + 1))/num_xyz(j)
+        r_nodes = r_nodes/nd(j)
+      end do
+    end do
+    deallocate(nodes)
+
+    call transformmeshcoords(fe_dim, coords_size, coords, exact_volume, err)
+
+    offset = 0
+    call ceedvectorsetarray(mesh_coords, ceed_mem_host, ceed_copy_values, coords, offset, err)
+    deallocate(coords)
+end
+
+!-----------------------------------------------------------------------
+program main
+    implicit none
+    include 'ceed/fortran.h'
+
+    character ceed_spec*32
+    integer fe_dim, num_comp_x, mesh_degree, sol_degree, num_qpts
+    integer num_elem, num_xyz(3), elem_qpts
+    integer prob_size, mesh_size, sol_size
+    integer help, test, gallery, benchmark
+    integer i, num_args, err
+    character arg*32, arg_value*32
+    real*8 exact_volume, computed_volume
+
+    integer ceed
+    real*8, dimension (:), allocatable :: u_array, v_array
+    integer mesh_coords, q_data, u, v
+    integer mesh_restriction, sol_restriction, q_data_restriction
+    integer mesh_basis, sol_basis
+    integer*8 offset
+    integer build_ctx
+    integer build_ctx_size
+    parameter(build_ctx_size = 2)
+    integer*8 build_ctx_data(build_ctx_size)
+    integer qf_build, qf_apply
+    integer op_build, op_apply
+
+    external build_mass, apply_mass
+
+! Initial values
+    ceed_spec   = '/cpu/self'
+    fe_dim      = 3
+    num_comp_x  = 3
+    mesh_degree = 4
+    sol_degree  = 4
+    num_qpts    = mesh_degree + 2
+    prob_size   = -1
+    help      = 0
+    test      = 0
+    gallery   = 0
+    benchmark = 0
+
+! Process command line arguments
+   
+    num_args = command_argument_count()
+    do i = 1, num_args
+      call get_command_argument(i, arg)
+
+      select case (arg)
+! LCOV_EXCL_START
+        case ('-h')
+          help = 1
+
+        case ('-c',  '-ceed')
+          call get_command_argument(i + 1, ceed_spec)
+
+        case ('-d')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') fe_dim
+          num_comp_x = fe_dim
+
+        case ('-m')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') mesh_degree
+
+        case ('-p')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') sol_degree
+
+        case ('-q')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') num_qpts
+
+        case ('-s')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') prob_size
+
+        case ('-b')
+          call get_command_argument(i + 1, arg_value)
+          read(arg_value, '(I10)') benchmark
+! LCOV_EXCL_STOP
+
+        case ('-t')
+          test = 1
+
+        case ('-g')
+          gallery = 1
+      end select
+    end do
+
+    if (prob_size < 0) then
+      if (test == 1) then
+        prob_size = 8 * 16
+      else
+        prob_size = 256 * 1024
+      end if
+    end if
+
+! Print options
+    if ((test /= 1) .OR. (help == 1)) then
+! LCOV_EXCL_START
+    write (*, *) 'Selected options: [command line option] : <current value>'
+    write (*, *) '  Ceed specification     [-c] : ', ceed_spec
+    write (*, *) '  Mesh dimension         [-d] : ', fe_dim
+    write (*, *) '  Mesh degree            [-m] : ', mesh_degree
+    write (*, *) '  Solution degree        [-p] : ', sol_degree
+    write (*, *) '  Num. 1D quadrature pts [-q] : ', num_qpts
+    write (*, *) '  Approx. # unknowns     [-s] : ', prob_size
+    if (gallery == 1) then
+      write (*, *) '  QFunction source       [-g] : gallery'
+    else
+      write (*, *) '  QFunction source       [-g] : header'
+    end if
+    if (help == 1) then
+      if (test == 0) then
+        write (*, *) 'Test/quiet mode is OFF (use -t to enable)'
+      else
+        write (*, *) 'Test/quiet mode is ON'
+      end if
+    end if
+! LCOV_EXCL_STOP
+    end if
+
+! Select appropriate backend and logical device based on the (-ceed) command line argument
+    call ceedinit(trim(ceed_spec)//char(0), ceed, err)
+
+! Construct the mesh and solution bases
+    call ceedbasiscreatetensorh1lagrange(ceed, fe_dim, num_comp_x, mesh_degree + 1, num_qpts, ceed_gauss, mesh_basis,&
+             &err)
+    call ceedbasiscreatetensorh1lagrange(ceed, fe_dim, 1, sol_degree + 1, num_qpts, ceed_gauss, sol_basis, err)
+
+! Determine the mesh size based on the given approximate problem size
+    call getcartesianmeshsize(fe_dim, sol_degree, prob_size, num_xyz)
+    if (test == 0) then
+! LCOV_EXCL_START
+    write (*, '(A16, I8)', advance='no') 'Mesh size: nx = ', num_xyz(1)
+    if (num_comp_x > 1) then
+      write (*, '(A7, I8)', advance='no') ',  ny = ', num_xyz(2)
+    end if
+    if (num_comp_x > 2) then
+      write (*, '(A7, I8)', advance='no') ',  nz = ', num_xyz(3)
+    end if
+    write (*, *)
+! LCOV_EXCL_STOP
+    endif
+
+! Build CeedElemRestriction objects describing the mesh and solution discrete representation
+    call buildcartesianrestriction(ceed, fe_dim, num_xyz, mesh_degree, num_comp_x, mesh_size, num_qpts,&
+             &mesh_restriction, ceed_qfunction_none, err)
+    call buildcartesianrestriction(ceed, fe_dim, num_xyz, sol_degree, 1, sol_size, num_qpts, sol_restriction,&
+             &q_data_restriction, err)
+
+    if (test == 0) then
+! LCOV_EXCL_START
+      write (*, *) 'Number of mesh nodes     : ', mesh_size/fe_dim
+      write (*, *) 'Number of solution nodes : ', sol_size
+! LCOV_EXCL_STOP
+    end if
+
+! Create a CeedVector with the mesh coordinates
+! Apply a transformation to the mesh
+    call ceedvectorcreate(ceed, mesh_size, mesh_coords, err)
+    call setcartesianmeshcoords(fe_dim, num_xyz, mesh_degree, mesh_coords, exact_volume, err)
+
+! Context data to be passed to the 'build_mass' QFunction
+    build_ctx_data(1) = fe_dim
+    build_ctx_data(2) = num_comp_x
+    call ceedqfunctioncontextcreate(ceed, build_ctx, err)
+! Note: The context technically only takes arrays of double precision values, but we can pass arrays of ints of the same length
+    offset = 0
+    call ceedqfunctioncontextsetdata(build_ctx, ceed_mem_host, ceed_use_pointer, build_ctx_size, build_ctx_data,&
+             &offset, err)
+
+! Create the QFunction that builds the mass operator (i.e. computes its quadrature data) and set its context data
+    if (gallery == 1) then
+      select case (fe_dim)
+        case (1)
+          call ceedqfunctioncreateinteriorbyname(ceed, 'Mass1DBuild', qf_build, err)
+
+        case (2)
+          call ceedqfunctioncreateinteriorbyname(ceed, 'Mass2DBuild', qf_build, err)
+
+        case (3)
+          call ceedqfunctioncreateinteriorbyname(ceed, 'Mass3DBuild', qf_build, err)
+      end select
+    else
+      call ceedqfunctioncreateinterior(ceed, 1, build_mass,&
+               &SOURCE_DIR&
+               &//'ex1-volume-f-c.h:build_mass'//char(0), qf_build, err)
+      call ceedqfunctionaddinput(qf_build, 'dx', num_comp_x * fe_dim, ceed_eval_grad, err)
+      call ceedqfunctionaddinput(qf_build, 'weights', 1, ceed_eval_weight, err)
+      call ceedqfunctionaddoutput(qf_build, 'qdata', 1, ceed_eval_none, err)
+      call ceedqfunctionsetcontext(qf_build, build_ctx, err)
+    end if
+
+! Create the operator that builds the quadrature data for the mass operator
+    call ceedoperatorcreate(ceed, qf_build, ceed_qfunction_none, ceed_qfunction_none, op_build, err)
+    call ceedoperatorsetfield(op_build, 'dx', mesh_restriction, mesh_basis, ceed_vector_active, err)
+    call ceedoperatorsetfield(op_build, 'weights', ceed_elemrestriction_none, mesh_basis, ceed_vector_none, err)
+    call ceedoperatorsetfield(op_build, 'qdata', q_data_restriction, ceed_basis_none, ceed_vector_active, err)
+
+! Compute the quadrature data for the mass operator
+    num_elem  = 1
+    elem_qpts = num_qpts**fe_dim
+    do i = 1, fe_dim
+      num_elem = num_elem * num_xyz(i)
+    end do
+    call ceedvectorcreate(ceed, num_elem * elem_qpts, q_data, err)
+    call ceedoperatorapply(op_build, mesh_coords, q_data, ceed_request_immediate, err)
+
+! Create the QFunction that defines the action of the mass operator
+    if (gallery == 1) then
+      call ceedqfunctioncreateinteriorbyname(ceed, 'MassApply', qf_apply, err)
+    else
+      call ceedqfunctioncreateinterior(ceed, 1, apply_mass,&
+               &SOURCE_DIR&
+               &//'ex1-volume-f-c.h:apply_mass'//char(0), qf_apply, err)
+      call ceedqfunctionaddinput(qf_apply, 'u', 1, ceed_eval_interp, err)
+      call ceedqfunctionaddinput(qf_apply, 'qdata', 1, ceed_eval_none, err)
+      call ceedqfunctionaddoutput(qf_apply, 'v', 1, ceed_eval_interp, err)
+    end if
+
+! Create the mass operator
+    call ceedoperatorcreate(ceed, qf_apply, ceed_qfunction_none, ceed_qfunction_none, op_apply, err)
+    call ceedoperatorsetfield(op_apply, 'u', sol_restriction, sol_basis, ceed_vector_active, err)
+    call ceedoperatorsetfield(op_apply, 'qdata', q_data_restriction, ceed_basis_none, q_data, err)
+    call ceedoperatorsetfield(op_apply, 'v', sol_restriction, sol_basis, ceed_vector_active, err)
+
+! Create auxiliary solution-size vectors
+    allocate (u_array(sol_size))
+    allocate (v_array(sol_size))
+
+    call ceedvectorcreate(ceed, sol_size, u, err)
+    offset = 0
+    call ceedvectorsetarray(u, ceed_mem_host, ceed_use_pointer, u_array, offset, err)
+    call ceedvectorcreate(ceed, sol_size, v, err)
+    offset = 0
+    call ceedvectorsetarray(v, ceed_mem_host, ceed_use_pointer, v_array, offset, err)
+
+! Initialize 'u' with ones
+    call ceedvectorsetvalue(u, 1.d0, err)
+
+! Compute the mesh volume using the mass operator: volume = 1^T \cdot M \cdot 1
+    call ceedoperatorapply(op_apply, u, v, ceed_request_immediate, err)
+
+! Benchmark runs
+    if (test /= 1 .AND. benchmark /= 0) then
+! LCOV_EXCL_START
+      write (*, *) ' Executing ', benchmark, ' benchmarking runs...'
+! LCOV_EXCL_STOP
+    end if
+    do i = 1, benchmark
+! LCOV_EXCL_START
+      call ceedoperatorapply(op_apply, u, v, ceed_request_immediate, err)
+! LCOV_EXCL_STOP
+    end do
+
+! Compute and print the sum of the entries of 'v' giving the mesh volume
+    computed_volume = 0.d0
+
+    call ceedvectorgetarrayread(v, ceed_mem_host, v_array, offset, err)
+    do i = 1, sol_size
+      computed_volume = computed_volume + v_array(offset + i)
+    end do
+    call ceedvectorrestorearrayread(v, v_array, offset, err)
+
+    if (test /= 1) then
+! LCOV_EXCL_START
+      write (*, *) ' done.'
+      write (*, *) 'Exact mesh volume    :', exact_volume
+      write (*, *) 'Computed mesh volume :', computed_volume
+      write (*, *) 'Volume error         :', (exact_volume - computed_volume)
+! LCOV_EXCL_STOP
+    else
+      if (fe_dim == 1) then
+        if (abs(exact_volume - computed_volume) > 200.d0 * 1e-15) then
+! LCOV_EXCL_START
+          write (*, *) 'Volume error : ', (exact_volume - computed_volume)
+! LCOV_EXCL_STOP
+        end if
+      else
+        if (abs(exact_volume - computed_volume) > 1e-5) then
+! LCOV_EXCL_START
+          write (*, *) 'Volume error : ', (exact_volume - computed_volume)
+! LCOV_EXCL_STOP
+        end if
+      end if
+    end if
+
+! Free dynamically allocated memory
+    call ceedvectordestroy(mesh_coords, err)
+    call ceedvectordestroy(q_data, err)
+    call ceedvectordestroy(u, err)
+    call ceedvectordestroy(v, err)
+    deallocate (u_array)
+    deallocate (v_array)
+    call ceedbasisdestroy(sol_basis, err)
+    call ceedbasisdestroy(mesh_basis, err)
+    call ceedqfunctioncontextdestroy(build_ctx, err)
+    call ceedqfunctiondestroy(qf_build, err)
+    call ceedqfunctiondestroy(qf_apply, err)
+    call ceedoperatordestroy(op_build, err)
+    call ceedoperatordestroy(op_apply, err)
+    call ceeddestroy(ceed, err)
+end
+!-----------------------------------------------------------------------
diff --git a/examples/ceed/ex1-volume-f.h b/examples/ceed/ex1-volume-f.h
new file mode 100644
index 0000000000..08ea68ef6f
--- /dev/null
+++ b/examples/ceed/ex1-volume-f.h
@@ -0,0 +1,55 @@
+!-----------------------------------------------------------------------
+subroutine build_mass(ctx, q, j, w, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15, u16,&
+    qdata, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, ierr)
+      integer*8 ctx(2)
+      integer*8 fe_dim, space_dim
+! j is Jacobians with shape [dim,  dim, Q]
+! w is quadrature weights with shape [1, Q]
+      real*8 j(1)
+      real*8 w(1)
+! qdata is quadrature data with shape [1, Q]
+      real*8 qdata(1)
+      integer q, ierr
+
+      fe_dim = ctx(1)
+      space_dim = ctx(2)
+
+      select case (fe_dim + 10*space_dim)
+        case (11)
+          do i = 1, q
+            qdata(i) = j(i) * w(i)
+          end do
+
+        case (22)
+          do i = 1, q
+            qdata(i) = (j(0*q + i)*j(3*q + i) - j(1*q + i)*j(2*q + i)) * w(i)
+          end do
+
+        case (33)
+          do i = 1, q
+            qdata(i) = (j(0*q + i) * (j(4*q + i)*j(8*q + i) - j(5*q + i)*j(7*q + i)) -&
+                       &j(1*q + i) * (j(3*q + i)*j(8*q + i) - j(5*q + i)*j(6*q + i)) +&
+                       &j(2*q + i) * (j(3*q + i)*j(7*q + i) - j(4*q + i)*j(6*q + i))) * w(i)
+          end do
+      end select
+      ierr = 0
+end
+
+!-----------------------------------------------------------------------
+subroutine apply_mass(ctx, q, u, qdata, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15, u16,&
+    v, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, ierr)
+      integer*8 ctx
+! u is solution variables with shape [1, Q]
+! qdata is quadrature data with shape [1, Q]
+      real*8 u(1)
+      real*8 qdata(1)
+! v is solution variables with shape [1, Q]
+      real*8 v(1)
+      integer q, ierr
+
+      do i = 1, q
+        v(i) = qdata(i) * u(i)
+      end do
+      ierr = 0
+end
+!-----------------------------------------------------------------------
diff --git a/examples/ceed/ex1-volume.c b/examples/ceed/ex1-volume.c
index 04852f28c1..354f977113 100644
--- a/examples/ceed/ex1-volume.c
+++ b/examples/ceed/ex1-volume.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -60,7 +60,7 @@ int main(int argc, const char *argv[]) {
   CeedInt     sol_degree  = 4;               // polynomial degree for the solution
   CeedInt     num_qpts    = sol_degree + 2;  // number of 1D quadrature points
   CeedInt     prob_size   = -1;              // approximate problem size
-  CeedInt     help = 0, test = 0, gallery = 0;
+  CeedInt     help = 0, test = 0, gallery = 0, benchmark = 0;
 
   // Process command line arguments.
   for (int ia = 1; ia < argc; ia++) {
@@ -81,6 +81,8 @@ int main(int argc, const char *argv[]) {
       parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1;
     } else if (!strcmp(argv[ia], "-s")) {
       parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-b")) {
+      parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1;
     } else if (!strcmp(argv[ia], "-t")) {
       test = 1;
     } else if (!strcmp(argv[ia], "-g")) {
@@ -115,15 +117,18 @@ int main(int argc, const char *argv[]) {
 
   // Select appropriate backend and logical device based on the (-ceed) command line argument.
   Ceed ceed;
+
   CeedInit(ceed_spec, &ceed);
 
   // Construct the mesh and solution bases.
   CeedBasis mesh_basis, sol_basis;
+
   CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis);
   CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis);
 
   // Determine the mesh size based on the given approximate problem size.
   CeedInt num_xyz[dim];
+
   GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz);
   if (!test) {
     // LCOV_EXCL_START
@@ -137,6 +142,7 @@ int main(int argc, const char *argv[]) {
   // Build CeedElemRestriction objects describing the mesh and solution discrete representations.
   CeedInt             mesh_size, sol_size;
   CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction;
+
   BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL);
   BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, &q_data_restriction);
   if (!test) {
@@ -148,6 +154,7 @@ int main(int argc, const char *argv[]) {
 
   // Create a CeedVector with the mesh coordinates.
   CeedVector mesh_coords;
+
   CeedVectorCreate(ceed, mesh_size, &mesh_coords);
   SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords);
 
@@ -157,12 +164,14 @@ int main(int argc, const char *argv[]) {
   // Context data to be passed to the 'build_mass' QFunction.
   CeedQFunctionContext build_ctx;
   struct BuildContext  build_ctx_data;
+
   build_ctx_data.dim = build_ctx_data.space_dim = dim;
   CeedQFunctionContextCreate(ceed, &build_ctx);
   CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
 
   // Create the QFunction that builds the mass operator (i.e. computes its quadrature data) and set its context data.
   CeedQFunction qf_build;
+
   if (gallery) {
     // This creates the QFunction via the gallery.
     char name[13] = "";
@@ -179,6 +188,7 @@ int main(int argc, const char *argv[]) {
 
   // Create the operator that builds the quadrature data for the mass operator.
   CeedOperator op_build;
+
   CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
   CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE);
@@ -188,12 +198,14 @@ int main(int argc, const char *argv[]) {
   CeedVector q_data;
   CeedInt    elem_qpts = CeedIntPow(num_qpts, dim);
   CeedInt    num_elem  = 1;
+
   for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d];
   CeedVectorCreate(ceed, num_elem * elem_qpts, &q_data);
   CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE);
 
   // Create the QFunction that defines the action of the mass operator.
   CeedQFunction qf_apply;
+
   if (gallery) {
     // This creates the QFunction via the gallery.
     CeedQFunctionCreateInteriorByName(ceed, "MassApply", &qf_apply);
@@ -207,6 +219,7 @@ int main(int argc, const char *argv[]) {
 
   // Create the mass operator.
   CeedOperator op_apply;
+
   CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
   CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
@@ -214,6 +227,7 @@ int main(int argc, const char *argv[]) {
 
   // Create auxiliary solution-size vectors.
   CeedVector u, v;
+
   CeedVectorCreate(ceed, sol_size, &u);
   CeedVectorCreate(ceed, sol_size, &v);
 
@@ -223,10 +237,24 @@ int main(int argc, const char *argv[]) {
   // Compute the mesh volume using the mass operator: volume = 1^T \cdot M \cdot 1
   CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
 
+  // Benchmark runs
+  if (!test && benchmark) {
+    // LCOV_EXCL_START
+    printf(" Executing %d benchmarking runs...\n", benchmark);
+    // LCOV_EXCL_STOP
+  }
+  for (CeedInt i = 0; i < benchmark; i++) {
+    // LCOV_EXCL_START
+    CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+    // LCOV_EXCL_STOP
+  }
+
   // Compute and print the sum of the entries of 'v' giving the mesh volume.
   CeedScalar volume = 0.;
+
   {
     const CeedScalar *v_array;
+
     CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
     for (CeedInt i = 0; i < sol_size; i++) volume += v_array[i];
     CeedVectorRestoreArrayRead(v, &v_array);
@@ -240,6 +268,7 @@ int main(int argc, const char *argv[]) {
     // LCOV_EXCL_STOP
   } else {
     CeedScalar tol = (dim == 1 ? 200. * CEED_EPSILON : dim == 2 ? 1E-5 : 1E-5);
+
     if (fabs(volume - exact_volume) > tol) printf("Volume error : % .1e\n", volume - exact_volume);
   }
 
@@ -267,13 +296,16 @@ int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt
   //    prob_size ~ num_elem * degree^dim
   CeedInt num_elem = prob_size / CeedIntPow(degree, dim);
   CeedInt s        = 0;  // find s: num_elem/2 < 2^s <= num_elem
+
   while (num_elem > 1) {
     num_elem /= 2;
     s++;
   }
   CeedInt r = s % dim;
+
   for (CeedInt d = 0; d < dim; d++) {
     CeedInt sd = s / dim;
+
     if (r > 0) {
       sd++;
       r--;
@@ -289,6 +321,7 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed
   CeedInt num_nodes = CeedIntPow(p, dim);         // number of scalar nodes per element
   CeedInt elem_qpts = CeedIntPow(num_qpts, dim);  // number of qpts per element
   CeedInt nd[3], num_elem = 1, scalar_size = 1;
+
   for (CeedInt d = 0; d < dim; d++) {
     num_elem *= num_xyz[d];
     nd[d] = num_xyz[d] * (p - 1) + 1;
@@ -299,15 +332,19 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed
   //           |---*-...-*---|---*-...-*---|- ... -|--...--|
   // num_nodes:   0   1    p-1  p  p+1       2*p             n*p
   CeedInt *elem_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes);
+
   for (CeedInt e = 0; e < num_elem; e++) {
     CeedInt e_xyz[3] = {1, 1, 1}, re = e;
+
     for (CeedInt d = 0; d < dim; d++) {
       e_xyz[d] = re % num_xyz[d];
       re /= num_xyz[d];
     }
     CeedInt *local_elem_nodes = elem_nodes + e * num_nodes;
+
     for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) {
       CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes;
+
       for (CeedInt d = 0; d < dim; d++) {
         g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride;
         g_nodes_stride *= nd[d];
@@ -318,8 +355,9 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed
   }
   CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, elem_nodes,
                             restriction);
-  if (q_data_restriction)
+  if (q_data_restriction) {
     CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, q_data_restriction);
+  }
   free(elem_nodes);
   return 0;
 }
@@ -327,20 +365,25 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], Ceed
 int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords) {
   CeedInt p = mesh_degree + 1;
   CeedInt nd[3], scalar_size = 1;
+
   for (CeedInt d = 0; d < dim; d++) {
     nd[d] = num_xyz[d] * (p - 1) + 1;
     scalar_size *= nd[d];
   }
   CeedScalar *coords;
+
   CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords);
   CeedScalar *nodes = malloc(sizeof(CeedScalar) * p);
+
   // The H1 basis uses Lobatto quadrature points as nodes.
   CeedLobattoQuadrature(p, nodes, NULL);  // nodes are in [-1,1]
   for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i];
   for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) {
     CeedInt r_nodes = gs_nodes;
+
     for (CeedInt d = 0; d < dim; d++) {
-      CeedInt d_1d                       = r_nodes % nd[d];
+      CeedInt d_1d = r_nodes % nd[d];
+
       coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) + nodes[d_1d % (p - 1)]) / num_xyz[d];
       r_nodes /= nd[d];
     }
@@ -358,6 +401,7 @@ int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degre
 CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords) {
   CeedScalar  exact_volume;
   CeedScalar *coords;
+
   CeedVectorGetArray(mesh_coords, CEED_MEM_HOST, &coords);
   if (dim == 1) {
     for (CeedInt i = 0; i < mesh_size; i++) {
@@ -367,10 +411,12 @@ CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_c
     exact_volume = 1.;
   } else {
     CeedInt num_nodes = mesh_size / dim;
+
     for (CeedInt i = 0; i < num_nodes; i++) {
       // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
       // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
       CeedScalar u = coords[i], v = coords[i + num_nodes];
+
       u                     = 1. + u;
       v                     = M_PI_2 * v;
       coords[i]             = u * cos(v);
diff --git a/examples/ceed/ex1-volume.h b/examples/ceed/ex1-volume.h
index d78ea16c6f..581cff997e 100644
--- a/examples/ceed/ex1-volume.h
+++ b/examples/ceed/ex1-volume.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 /// A structure used to pass additional data to f_build_mass
 struct BuildContext {
@@ -14,47 +14,51 @@ struct BuildContext {
 
 /// libCEED Q-function for building quadrature data for a mass operator
 CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // in[0] is Jacobians with shape [dim, nc=dim, Q]
-  // in[1] is quadrature weights, size (Q)
   struct BuildContext *build_data = (struct BuildContext *)ctx;
-  const CeedScalar    *J = in[0], *w = in[1];
-  CeedScalar          *q_data = out[0];
+
+  // in[0] is Jacobians with shape [dim, dim, Q]
+  // in[1] is quadrature weights with shape [1, Q]
+  const CeedScalar *w      = in[1];
+  CeedScalar       *q_data = out[0];
 
   switch (build_data->dim + 10 * build_data->space_dim) {
-    case 11:
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
       // Quadrature Point Loop
-      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[i] * w[i]; }  // End of Quadrature Point Loop
-      break;
-    case 22:
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[0][0][i] * w[i]; }  // End of Quadrature Point Loop
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
       // Quadrature Point Loop
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-        // 0 2
-        // 1 3
-        q_data[i] = (J[i + Q * 0] * J[i + Q * 3] - J[i + Q * 1] * J[i + Q * 2]) * w[i];
+        q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i];
       }  // End of Quadrature Point Loop
-      break;
-    case 33:
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
       // Quadrature Point Loop
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-        // 0 3 6
-        // 1 4 7
-        // 2 5 8
-        q_data[i] = (J[i + Q * 0] * (J[i + Q * 4] * J[i + Q * 8] - J[i + Q * 5] * J[i + Q * 7]) -
-                     J[i + Q * 1] * (J[i + Q * 3] * J[i + Q * 8] - J[i + Q * 5] * J[i + Q * 6]) +
-                     J[i + Q * 2] * (J[i + Q * 3] * J[i + Q * 7] - J[i + Q * 4] * J[i + Q * 6])) *
-                    w[i];
+        q_data[i] =
+            (J[0][0][i] * (J[1][1][i] * J[2][2][i] - J[1][2][i] * J[2][1][i]) - J[0][1][i] * (J[1][0][i] * J[2][2][i] - J[1][2][i] * J[2][0][i]) +
+             J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) *
+            w[i];
       }  // End of Quadrature Point Loop
-      break;
+    } break;
   }
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 /// libCEED Q-function for applying a mass operator
 CEED_QFUNCTION(apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0], out[0] are solution variables with shape [1, Q]
+  // in[1] is quadrature data with shape [1, Q]
   const CeedScalar *u = in[0], *q_data = in[1];
   CeedScalar       *v = out[0];
 
   // Quadrature Point Loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = q_data[i] * u[i]; }  // End of Quadrature Point Loop
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
diff --git a/examples/ceed/ex2-surface.c b/examples/ceed/ex2-surface.c
index d536068800..2191e4dc63 100644
--- a/examples/ceed/ex2-surface.c
+++ b/examples/ceed/ex2-surface.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -60,7 +60,7 @@ int main(int argc, const char *argv[]) {
   CeedInt     sol_degree  = 4;               // polynomial degree for the solution
   CeedInt     num_qpts    = sol_degree + 2;  // number of 1D quadrature points
   CeedInt     prob_size   = -1;              // approximate problem size
-  CeedInt     help = 0, test = 0, gallery = 0;
+  CeedInt     help = 0, test = 0, gallery = 0, benchmark = 0;
 
   // Process command line arguments.
   for (int ia = 1; ia < argc; ia++) {
@@ -81,6 +81,8 @@ int main(int argc, const char *argv[]) {
       parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1;
     } else if (!strcmp(argv[ia], "-s")) {
       parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-b")) {
+      parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1;
     } else if (!strcmp(argv[ia], "-t")) {
       test = 1;
     } else if (!strcmp(argv[ia], "-g")) {
@@ -119,15 +121,18 @@ int main(int argc, const char *argv[]) {
 
   // Select appropriate backend and logical device based on the (-ceed) command line argument.
   Ceed ceed;
+
   CeedInit(ceed_spec, &ceed);
 
   // Construct the mesh and solution bases.
   CeedBasis mesh_basis, sol_basis;
+
   CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis);
   CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis);
 
   // Determine the mesh size based on the given approximate problem size.
   CeedInt num_xyz[3];
+
   GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz);
 
   if (!test) {
@@ -142,6 +147,7 @@ int main(int argc, const char *argv[]) {
   // Build CeedElemRestriction objects describing the mesh and solution discrete representations.
   CeedInt             mesh_size, sol_size;
   CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction;
+
   BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL);
   BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, dim * (dim + 1) / 2, &sol_size, num_qpts, NULL, &q_data_restriction);
   BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, NULL);
@@ -154,6 +160,7 @@ int main(int argc, const char *argv[]) {
 
   // Create a CeedVector with the mesh coordinates.
   CeedVector mesh_coords;
+
   CeedVectorCreate(ceed, mesh_size, &mesh_coords);
   SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords);
 
@@ -163,12 +170,14 @@ int main(int argc, const char *argv[]) {
   // Context data to be passed to the 'build_diff' QFunction.
   CeedQFunctionContext build_ctx;
   struct BuildContext  build_ctx_data;
+
   build_ctx_data.dim = build_ctx_data.space_dim = dim;
   CeedQFunctionContextCreate(ceed, &build_ctx);
   CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
 
   // Create the QFunction that builds the diffusion operator (i.e. computes its quadrature data) and set its context data.
   CeedQFunction qf_build;
+
   if (gallery) {
     // This creates the QFunction via the gallery.
     char name[16] = "";
@@ -185,6 +194,7 @@ int main(int argc, const char *argv[]) {
 
   // Create the operator that builds the quadrature data for the diffusion operator.
   CeedOperator op_build;
+
   CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
   CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE);
@@ -194,15 +204,17 @@ int main(int argc, const char *argv[]) {
   CeedVector q_data;
   CeedInt    elem_qpts = CeedIntPow(num_qpts, dim);
   CeedInt    num_elem  = 1;
+
   for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d];
   CeedVectorCreate(ceed, num_elem * elem_qpts * dim * (dim + 1) / 2, &q_data);
   CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE);
 
   // Create the QFunction that defines the action of the diffusion operator.
   CeedQFunction qf_apply;
+
   if (gallery) {
     // This creates the QFunction via the gallery.
-    char name[16] = "";
+    char name[25] = "";
     snprintf(name, sizeof name, "Poisson%" CeedInt_FMT "DApply", dim);
     CeedQFunctionCreateInteriorByName(ceed, name, &qf_apply);
   } else {
@@ -216,6 +228,7 @@ int main(int argc, const char *argv[]) {
 
   // Create the diffusion operator.
   CeedOperator op_apply;
+
   CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
   CeedOperatorSetField(op_apply, "du", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
@@ -223,6 +236,7 @@ int main(int argc, const char *argv[]) {
 
   // Create auxiliary solution-size vectors.
   CeedVector u, v;
+
   CeedVectorCreate(ceed, sol_size, &u);
   CeedVectorCreate(ceed, sol_size, &v);
 
@@ -230,6 +244,7 @@ int main(int argc, const char *argv[]) {
   {
     CeedScalar       *u_array;
     const CeedScalar *x_array;
+
     CeedVectorGetArrayWrite(u, CEED_MEM_HOST, &u_array);
     CeedVectorGetArrayRead(mesh_coords, CEED_MEM_HOST, &x_array);
     for (CeedInt i = 0; i < sol_size; i++) {
@@ -243,10 +258,23 @@ int main(int argc, const char *argv[]) {
   // Compute the mesh surface area using the diff operator: surface_area = 1^T \cdot abs( K \cdot x).
   CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
 
+  // Benchmark runs
+  if (!test && benchmark) {
+    // LCOV_EXCL_START
+    printf(" Executing %d benchmarking runs...\n", benchmark);
+    // LCOV_EXCL_STOP
+  }
+  for (CeedInt i = 0; i < benchmark; i++) {
+    // LCOV_EXCL_START
+    CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+    // LCOV_EXCL_STOP
+  }
+
   // Compute and print the sum of the entries of 'v' giving the mesh surface area.
   CeedScalar surface_area = 0.;
   {
     const CeedScalar *v_array;
+
     CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
     for (CeedInt i = 0; i < sol_size; i++) surface_area += fabs(v_array[i]);
     CeedVectorRestoreArrayRead(v, &v_array);
@@ -260,6 +288,7 @@ int main(int argc, const char *argv[]) {
     // LCOV_EXCL_STOP
   } else {
     CeedScalar tol = (dim == 1 ? 10000. * CEED_EPSILON : dim == 2 ? 1E-1 : 1E-1);
+
     if (fabs(surface_area - exact_surface_area) > tol) printf("Surface area error         : % .14g\n", surface_area - exact_surface_area);
   }
 
@@ -287,13 +316,16 @@ int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt
   //    prob_size ~ num_elem * degree^dim
   CeedInt num_elem = prob_size / CeedIntPow(degree, dim);
   CeedInt s        = 0;  // find s: num_elem/2 < 2^s <= num_elem
+
   while (num_elem > 1) {
     num_elem /= 2;
     s++;
   }
   CeedInt r = s % dim;
+
   for (CeedInt d = 0; d < dim; d++) {
     CeedInt sd = s / dim;
+
     if (r > 0) {
       sd++;
       r--;
@@ -309,6 +341,7 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn
   CeedInt num_nodes = CeedIntPow(p, dim);         // number of scalar nodes per element
   CeedInt elem_qpts = CeedIntPow(num_qpts, dim);  // number of qpts per element
   CeedInt nd[3], num_elem = 1, scalar_size = 1;
+
   for (CeedInt d = 0; d < dim; d++) {
     num_elem *= num_xyz[d];
     nd[d] = num_xyz[d] * (p - 1) + 1;
@@ -319,15 +352,19 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn
   //           |---*-...-*---|---*-...-*---|- ... -|--...--|
   // num_nodes:   0   1    p-1  p  p+1       2*p             n*p
   CeedInt *el_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes);
+
   for (CeedInt e = 0; e < num_elem; e++) {
     CeedInt e_xyz[3] = {1, 1, 1}, re = e;
+
     for (CeedInt d = 0; d < dim; d++) {
       e_xyz[d] = re % num_xyz[d];
       re /= num_xyz[d];
     }
     CeedInt *local_elem_nodes = el_nodes + e * num_nodes;
+
     for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) {
       CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes;
+
       for (CeedInt d = 0; d < dim; d++) {
         g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride;
         g_nodes_stride *= nd[d];
@@ -336,9 +373,10 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn
       local_elem_nodes[l_nodes] = g_nodes;
     }
   }
-  if (restriction)
+  if (restriction) {
     CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, el_nodes,
                               restriction);
+  }
   free(el_nodes);
 
   if (q_data_restriction) {
@@ -351,20 +389,25 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedIn
 int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[3], CeedInt mesh_degree, CeedVector mesh_coords) {
   CeedInt p = mesh_degree + 1;
   CeedInt nd[3], scalar_size = 1;
+
   for (CeedInt d = 0; d < dim; d++) {
     nd[d] = num_xyz[d] * (p - 1) + 1;
     scalar_size *= nd[d];
   }
   CeedScalar *coords;
+
   CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords);
   CeedScalar *nodes = malloc(sizeof(CeedScalar) * p);
+
   // The H1 basis uses Lobatto quadrature points as nodes.
   CeedLobattoQuadrature(p, nodes, NULL);  // nodes are in [-1,1]
   for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i];
   for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) {
     CeedInt r_nodes = gs_nodes;
+
     for (CeedInt d = 0; d < dim; d++) {
-      CeedInt d1d                        = r_nodes % nd[d];
+      CeedInt d1d = r_nodes % nd[d];
+
       coords[gs_nodes + scalar_size * d] = ((d1d / (p - 1)) + nodes[d1d % (p - 1)]) / num_xyz[d];
       r_nodes /= nd[d];
     }
@@ -388,6 +431,5 @@ CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_c
     coords[i] = 0.5 + 1. / sqrt(3.) * sin((2. / 3.) * M_PI * (coords[i] - 0.5));
   }
   CeedVectorRestoreArray(mesh_coords, &coords);
-
   return exact_surface_area;
 }
diff --git a/examples/ceed/ex2-surface.h b/examples/ceed/ex2-surface.h
index 4258a1e944..c8aa53b29b 100644
--- a/examples/ceed/ex2-surface.h
+++ b/examples/ceed/ex2-surface.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 /// A structure used to pass additional data to f_build_diff
 struct BuildContext {
@@ -15,110 +15,126 @@ struct BuildContext {
 /// libCEED Q-function for building quadrature data for a diffusion operator
 CEED_QFUNCTION(build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   struct BuildContext *build_data = (struct BuildContext *)ctx;
-  // in[0] is Jacobians with shape [dim, nc=dim, Q]
+
+  // in[0] is Jacobians with shape [dim, dim, Q]
   // in[1] is quadrature weights, size (Q)
-  //
+  const CeedScalar *w             = in[1];
+  CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
+
   // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
   // the symmetric part of the result.
-  const CeedScalar *J = in[0], *w = in[1];
-  CeedScalar       *q_data = out[0];
-
   switch (build_data->dim + 10 * build_data->space_dim) {
-    case 11:
-      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = w[i] / J[i]; }  // End of Quadrature Point Loop
-      break;
-    case 22:
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[0][i] = w[i] / J[0][0][i]; }  // End of Quadrature Point Loop
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-        // J: 0 2   q_data: 0 2   adj(J):  J22 -J12
-        //    1 3          2 1           -J21  J11
-        const CeedScalar J11 = J[i + Q * 0];
-        const CeedScalar J21 = J[i + Q * 1];
-        const CeedScalar J12 = J[i + Q * 2];
-        const CeedScalar J22 = J[i + Q * 3];
-        const CeedScalar qw  = w[i] / (J11 * J22 - J21 * J12);
-        q_data[i + Q * 0]    = qw * (J12 * J12 + J22 * J22);
-        q_data[i + Q * 1]    = qw * (J11 * J11 + J21 * J21);
-        q_data[i + Q * 2]    = -qw * (J11 * J12 + J21 * J22);
+        // J: 0 2   q_data: 0 2   adj(J):  J11 -J01
+        //    1 3           2 1           -J10  J00
+        const CeedScalar J00 = J[0][0][i];
+        const CeedScalar J10 = J[0][1][i];
+        const CeedScalar J01 = J[1][0][i];
+        const CeedScalar J11 = J[1][1][i];
+        const CeedScalar qw  = w[i] / (J00 * J11 - J10 * J01);
+
+        q_data[0][i] = qw * (J01 * J01 + J11 * J11);
+        q_data[1][i] = qw * (J00 * J00 + J10 * J10);
+        q_data[2][i] = -qw * (J00 * J01 + J10 * J11);
       }  // End of Quadrature Point Loop
-      break;
-    case 33:
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
       CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Compute the adjoint
         CeedScalar A[3][3];
-        for (CeedInt j = 0; j < 3; j++)
-          for (CeedInt k = 0; k < 3; k++)
+
+        for (CeedInt j = 0; j < 3; j++) {
+          for (CeedInt k = 0; k < 3; k++) {
             // Equivalent code with J as a VLA and no mod operations:
             // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1]
-            A[k][j] = J[i + Q * ((j + 1) % 3 + 3 * ((k + 1) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 2) % 3))] -
-                      J[i + Q * ((j + 1) % 3 + 3 * ((k + 2) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 1) % 3))];
+            A[k][j] =
+                J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i];
+          }
+        }
 
         // Compute quadrature weight / det(J)
-        const CeedScalar qw = w[i] / (J[i + Q * 0] * A[0][0] + J[i + Q * 1] * A[0][1] + J[i + Q * 2] * A[0][2]);
+        const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
 
         // Compute geometric factors
         // Stored in Voigt convention
         // 0 5 4
         // 5 1 3
         // 4 3 2
-        q_data[i + Q * 0] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
-        q_data[i + Q * 1] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
-        q_data[i + Q * 2] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
-        q_data[i + Q * 3] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
-        q_data[i + Q * 4] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
-        q_data[i + Q * 5] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
+        q_data[0][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
+        q_data[1][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
+        q_data[2][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
+        q_data[3][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
+        q_data[4][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
+        q_data[5][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
       }  // End of Quadrature Point Loop
-      break;
+    } break;
   }
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 /// libCEED Q-function for applying a diff operator
 CEED_QFUNCTION(apply_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   struct BuildContext *build_data = (struct BuildContext *)ctx;
-  // in[0], out[0] have shape [dim, nc=1, Q]
-  const CeedScalar *ug = in[0], *q_data = in[1];
-  CeedScalar       *vg = out[0];
+
+  // in[0], out[0] solution gradients with shape [dim, 1, Q]
+  // in[1] is quadrature data with shape [num_components, Q]
+  const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
   switch (build_data->dim) {
-    case 1:
-      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[i]; }  // End of Quadrature Point Loop
-      break;
-    case 2:
-      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-        // Read spatial derivatives of u
-        const CeedScalar du[2] = {ug[i + Q * 0], ug[i + Q * 1]};
+    case 1: {
+      const CeedScalar *ug = in[0];
+      CeedScalar       *vg = out[0];
 
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[0][i]; }  // End of Quadrature Point Loop
+    } break;
+    case 2: {
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Read q_data (dXdxdXdx_T symmetric matrix)
         // Stored in Voigt convention
         // 0 2
         // 2 1
         const CeedScalar dXdxdXdx_T[2][2] = {
-            {q_data[i + 0 * Q], q_data[i + 2 * Q]},
-            {q_data[i + 2 * Q], q_data[i + 1 * Q]}
+            {q_data[0][i], q_data[2][i]},
+            {q_data[2][i], q_data[1][i]}
         };
+
         // j = direction of vg
-        for (int j = 0; j < 2; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j]);
+        for (int j = 0; j < 2; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]);
       }  // End of Quadrature Point Loop
-      break;
-    case 3:
-      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-        // Read spatial derivatives of u
-        const CeedScalar du[3] = {ug[i + Q * 0], ug[i + Q * 1], ug[i + Q * 2]};
+    } break;
+    case 3: {
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
         // Read q_data (dXdxdXdx_T symmetric matrix)
         // Stored in Voigt convention
         // 0 5 4
         // 5 1 3
         // 4 3 2
         const CeedScalar dXdxdXdx_T[3][3] = {
-            {q_data[i + 0 * Q], q_data[i + 5 * Q], q_data[i + 4 * Q]},
-            {q_data[i + 5 * Q], q_data[i + 1 * Q], q_data[i + 3 * Q]},
-            {q_data[i + 4 * Q], q_data[i + 3 * Q], q_data[i + 2 * Q]}
+            {q_data[0][i], q_data[5][i], q_data[4][i]},
+            {q_data[5][i], q_data[1][i], q_data[3][i]},
+            {q_data[4][i], q_data[3][i], q_data[2][i]}
         };
+
         // j = direction of vg
-        for (int j = 0; j < 3; j++) vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j] + du[2] * dXdxdXdx_T[2][j]);
+        for (int j = 0; j < 3; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]);
       }  // End of Quadrature Point Loop
-      break;
+    } break;
   }
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
diff --git a/examples/ceed/ex3-volume.c b/examples/ceed/ex3-volume.c
new file mode 100644
index 0000000000..380882a631
--- /dev/null
+++ b/examples/ceed/ex3-volume.c
@@ -0,0 +1,418 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+//                             libCEED Example 1
+//
+// This example illustrates a simple usage of libCEED to compute the volume of a 3D body using matrix-free application of a mass operator.
+// This example also uses a diffusion operator, which provides zero contribution to the computed volume but demonstrates libCEED's ability
+// to handle multiple basis evaluation modes for the same input and output vectors.
+// Arbitrary mesh and solution degrees in 1D, 2D and 3D are supported from the same code.
+//
+// The example has no dependencies, and is designed to be self-contained.
+// For additional examples that use external discretization libraries (MFEM, PETSc, etc.) see the subdirectories in libceed/examples.
+//
+// All libCEED objects use a Ceed device object constructed based on a command line argument (-ceed).
+//
+// Build with:
+//
+//     make ex3-volume [CEED_DIR=</path/to/libceed>]
+//
+// Sample runs:
+//
+//     ./ex3-volume
+//     ./ex3-volume -ceed /cpu/self
+//     ./ex3-volume -ceed /gpu/cuda
+//
+// Test in 1D-3D
+//TESTARGS(name="1D User QFunction") -ceed {ceed_resource} -d 1 -t
+//TESTARGS(name="2D User QFunction") -ceed {ceed_resource} -d 2 -t
+//TESTARGS(name="3D User QFunction") -ceed {ceed_resource} -d 3 -t
+
+/// @file
+/// libCEED example using mass operator to compute volume
+
+#include "ex3-volume.h"
+
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Auxiliary functions
+int        GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]);
+int        BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts,
+                                     CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction);
+int        SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords);
+CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords);
+
+// Main example
+int main(int argc, const char *argv[]) {
+  const char *ceed_spec   = "/cpu/self";
+  CeedInt     dim         = 3;               // dimension of the mesh
+  CeedInt     num_comp_x  = 3;               // number of x components
+  CeedInt     mesh_degree = 4;               // polynomial degree for the mesh
+  CeedInt     sol_degree  = 4;               // polynomial degree for the solution
+  CeedInt     num_qpts    = sol_degree + 2;  // number of 1D quadrature points
+  CeedInt     prob_size   = -1;              // approximate problem size
+  CeedInt     help = 0, test = 0, benchmark = 0;
+
+  // Process command line arguments.
+  for (int ia = 1; ia < argc; ia++) {
+    // LCOV_EXCL_START
+    int next_arg = ((ia + 1) < argc), parse_error = 0;
+    if (!strcmp(argv[ia], "-h")) {
+      help = 1;
+    } else if (!strcmp(argv[ia], "-c") || !strcmp(argv[ia], "-ceed")) {
+      parse_error = next_arg ? ceed_spec = argv[++ia], 0 : 1;
+    } else if (!strcmp(argv[ia], "-d")) {
+      parse_error = next_arg ? dim = atoi(argv[++ia]), 0 : 1;
+      num_comp_x                   = dim;
+    } else if (!strcmp(argv[ia], "-m")) {
+      parse_error = next_arg ? mesh_degree = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-p")) {
+      parse_error = next_arg ? sol_degree = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-q")) {
+      parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-s")) {
+      parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-b")) {
+      parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-t")) {
+      test = 1;
+    }
+    if (parse_error) {
+      printf("Error parsing command line options.\n");
+      return 1;
+    }
+    // LCOV_EXCL_STOP
+  }
+  if (prob_size < 0) prob_size = test ? 8 * 16 : 256 * 1024;
+
+  // Print the values of all options:
+  if (!test || help) {
+    // LCOV_EXCL_START
+    printf("Selected options: [command line option] : <current value>\n");
+    printf("  Ceed specification     [-c] : %s\n", ceed_spec);
+    printf("  Mesh dimension         [-d] : %" CeedInt_FMT "\n", dim);
+    printf("  Mesh degree            [-m] : %" CeedInt_FMT "\n", mesh_degree);
+    printf("  Solution degree        [-p] : %" CeedInt_FMT "\n", sol_degree);
+    printf("  Num. 1D quadrature pts [-q] : %" CeedInt_FMT "\n", num_qpts);
+    printf("  Approx. # unknowns     [-s] : %" CeedInt_FMT "\n", prob_size);
+    printf("  QFunction source            : header");
+    if (help) {
+      printf("Test/quiet mode is %s\n", (test ? "ON" : "OFF (use -t to enable)"));
+      return 0;
+    }
+    printf("\n");
+    // LCOV_EXCL_STOP
+  }
+
+  // Select appropriate backend and logical device based on the (-ceed) command line argument.
+  Ceed ceed;
+
+  CeedInit(ceed_spec, &ceed);
+
+  // Construct the mesh and solution bases.
+  CeedBasis mesh_basis, sol_basis;
+
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis);
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis);
+
+  // Determine the mesh size based on the given approximate problem size.
+  CeedInt num_xyz[dim];
+
+  GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz);
+  if (!test) {
+    // LCOV_EXCL_START
+    printf("Mesh size: nx = %" CeedInt_FMT, num_xyz[0]);
+    if (dim > 1) printf(", ny = %" CeedInt_FMT, num_xyz[1]);
+    if (dim > 2) printf(", nz = %" CeedInt_FMT, num_xyz[2]);
+    printf("\n");
+    // LCOV_EXCL_STOP
+  }
+
+  // Build CeedElemRestriction objects describing the mesh and solution discrete representations.
+  CeedInt             mesh_size, sol_size;
+  CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction;
+
+  BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL);
+  BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1 + dim * (dim + 1) / 2, &sol_size, num_qpts, NULL, &q_data_restriction);
+  BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, NULL);
+  if (!test) {
+    // LCOV_EXCL_START
+    printf("Number of mesh nodes     : %" CeedInt_FMT "\n", mesh_size / dim);
+    printf("Number of solution nodes : %" CeedInt_FMT "\n", sol_size);
+    // LCOV_EXCL_STOP
+  }
+
+  // Create a CeedVector with the mesh coordinates.
+  CeedVector mesh_coords;
+
+  CeedVectorCreate(ceed, mesh_size, &mesh_coords);
+  SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords);
+
+  // Apply a transformation to the mesh.
+  CeedScalar exact_volume = TransformMeshCoords(dim, mesh_size, mesh_coords);
+
+  // Context data to be passed to the 'build_mass_diff' QFunction.
+  CeedQFunctionContext build_ctx;
+  struct BuildContext  build_ctx_data;
+
+  build_ctx_data.dim = build_ctx_data.space_dim = dim;
+  CeedQFunctionContextCreate(ceed, &build_ctx);
+  CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
+
+  // Create the QFunction that builds the mass + diffusion operator (i.e. computes its quadrature data) and set its context data.
+  CeedQFunction qf_build;
+
+  CeedQFunctionCreateInterior(ceed, 1, build_mass_diff, build_mass_diff_loc, &qf_build);
+  CeedQFunctionAddInput(qf_build, "dx", num_comp_x * dim, CEED_EVAL_GRAD);
+  CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT);
+  CeedQFunctionAddOutput(qf_build, "qdata", 1 + dim * (dim + 1) / 2, CEED_EVAL_NONE);
+  CeedQFunctionSetContext(qf_build, build_ctx);
+
+  // Create the operator that builds the quadrature data for the mass + diffusion operator.
+  CeedOperator op_build;
+
+  CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
+  CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE);
+  CeedOperatorSetField(op_build, "qdata", q_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+
+  // Compute the quadrature data for the mass + diffusion operator.
+  CeedVector q_data;
+  CeedInt    elem_qpts = CeedIntPow(num_qpts, dim);
+  CeedInt    num_elem  = 1;
+
+  for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d];
+  CeedVectorCreate(ceed, num_elem * elem_qpts * (1 + dim * (dim + 1) / 2), &q_data);
+  CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE);
+
+  // Create the QFunction that defines the action of the mass + diffusion operator.
+  CeedQFunction qf_apply;
+
+  CeedQFunctionCreateInterior(ceed, 1, apply_mass_diff, apply_mass_diff_loc, &qf_apply);
+  CeedQFunctionAddInput(qf_apply, "u", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddInput(qf_apply, "du", dim, CEED_EVAL_GRAD);
+  CeedQFunctionAddInput(qf_apply, "qdata", 1 + dim * (dim + 1) / 2, CEED_EVAL_NONE);
+  CeedQFunctionAddOutput(qf_apply, "v", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddOutput(qf_apply, "dv", dim, CEED_EVAL_GRAD);
+  CeedQFunctionSetContext(qf_apply, build_ctx);
+
+  // Create the mass + diffusion operator.
+  CeedOperator op_apply;
+
+  CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
+  CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_apply, "du", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
+  CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_apply, "dv", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+
+  // Create auxiliary solution-size vectors.
+  CeedVector u, v;
+
+  CeedVectorCreate(ceed, sol_size, &u);
+  CeedVectorCreate(ceed, sol_size, &v);
+
+  // Initialize 'u' with ones.
+  CeedVectorSetValue(u, 1.0);
+
+  // Compute the mesh volume using the mass + diffusion operator: volume = 1^T \cdot M \cdot 1
+  CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+
+  // Benchmark runs
+  if (!test && benchmark) {
+    // LCOV_EXCL_START
+    printf(" Executing %d benchmarking runs...\n", benchmark);
+    // LCOV_EXCL_STOP
+  }
+  for (CeedInt i = 0; i < benchmark; i++) {
+    // LCOV_EXCL_START
+    CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+    // LCOV_EXCL_STOP
+  }
+
+  // Compute and print the sum of the entries of 'v' giving the mesh volume.
+  CeedScalar volume = 0.;
+
+  {
+    const CeedScalar *v_array;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < sol_size; i++) volume += v_array[i];
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+  if (!test) {
+    // LCOV_EXCL_START
+    printf(" done.\n");
+    printf("Exact mesh volume    : % .14g\n", exact_volume);
+    printf("Computed mesh volume : % .14g\n", volume);
+    printf("Volume error         : % .14g\n", volume - exact_volume);
+    // LCOV_EXCL_STOP
+  } else {
+    CeedScalar tol = (dim == 1 ? 200. * CEED_EPSILON : dim == 2 ? 1E-5 : 1E-5);
+
+    if (fabs(volume - exact_volume) > tol) printf("Volume error : % .1e\n", volume - exact_volume);
+  }
+
+  // Free dynamically allocated memory.
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&q_data);
+  CeedVectorDestroy(&mesh_coords);
+  CeedOperatorDestroy(&op_apply);
+  CeedQFunctionDestroy(&qf_apply);
+  CeedQFunctionContextDestroy(&build_ctx);
+  CeedOperatorDestroy(&op_build);
+  CeedQFunctionDestroy(&qf_build);
+  CeedElemRestrictionDestroy(&sol_restriction);
+  CeedElemRestrictionDestroy(&mesh_restriction);
+  CeedElemRestrictionDestroy(&q_data_restriction);
+  CeedBasisDestroy(&sol_basis);
+  CeedBasisDestroy(&mesh_basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
+
+int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]) {
+  // Use the approximate formula:
+  //    prob_size ~ num_elem * degree^dim
+  CeedInt num_elem = prob_size / CeedIntPow(degree, dim);
+  CeedInt s        = 0;  // find s: num_elem/2 < 2^s <= num_elem
+
+  while (num_elem > 1) {
+    num_elem /= 2;
+    s++;
+  }
+  CeedInt r = s % dim;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    CeedInt sd = s / dim;
+
+    if (r > 0) {
+      sd++;
+      r--;
+    }
+    num_xyz[d] = 1 << sd;
+  }
+  return 0;
+}
+
+int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts,
+                              CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction) {
+  CeedInt p         = degree + 1;
+  CeedInt num_nodes = CeedIntPow(p, dim);         // number of scalar nodes per element
+  CeedInt elem_qpts = CeedIntPow(num_qpts, dim);  // number of qpts per element
+  CeedInt nd[3], num_elem = 1, scalar_size = 1;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    num_elem *= num_xyz[d];
+    nd[d] = num_xyz[d] * (p - 1) + 1;
+    scalar_size *= nd[d];
+  }
+  *size = scalar_size * num_comp;
+  // elem:         0             1                 n-1
+  //           |---*-...-*---|---*-...-*---|- ... -|--...--|
+  // num_nodes:   0   1    p-1  p  p+1       2*p             n*p
+  CeedInt *elem_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes);
+
+  for (CeedInt e = 0; e < num_elem; e++) {
+    CeedInt e_xyz[3] = {1, 1, 1}, re = e;
+
+    for (CeedInt d = 0; d < dim; d++) {
+      e_xyz[d] = re % num_xyz[d];
+      re /= num_xyz[d];
+    }
+    CeedInt *local_elem_nodes = elem_nodes + e * num_nodes;
+
+    for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) {
+      CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes;
+
+      for (CeedInt d = 0; d < dim; d++) {
+        g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride;
+        g_nodes_stride *= nd[d];
+        r_nodes /= p;
+      }
+      local_elem_nodes[l_nodes] = g_nodes;
+    }
+  }
+  if (restriction) {
+    CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, elem_nodes,
+                              restriction);
+  }
+  if (q_data_restriction) {
+    CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, q_data_restriction);
+  }
+  free(elem_nodes);
+  return 0;
+}
+
+int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords) {
+  CeedInt p = mesh_degree + 1;
+  CeedInt nd[3], scalar_size = 1;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    nd[d] = num_xyz[d] * (p - 1) + 1;
+    scalar_size *= nd[d];
+  }
+  CeedScalar *coords;
+
+  CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords);
+  CeedScalar *nodes = malloc(sizeof(CeedScalar) * p);
+
+  // The H1 basis uses Lobatto quadrature points as nodes.
+  CeedLobattoQuadrature(p, nodes, NULL);  // nodes are in [-1,1]
+  for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i];
+  for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) {
+    CeedInt r_nodes = gs_nodes;
+
+    for (CeedInt d = 0; d < dim; d++) {
+      CeedInt d_1d                       = r_nodes % nd[d];
+      coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) + nodes[d_1d % (p - 1)]) / num_xyz[d];
+      r_nodes /= nd[d];
+    }
+  }
+  free(nodes);
+  CeedVectorRestoreArray(mesh_coords, &coords);
+  return 0;
+}
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#define M_PI_2 1.57079632679489661923
+#endif
+
+CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords) {
+  CeedScalar  exact_volume;
+  CeedScalar *coords;
+
+  CeedVectorGetArray(mesh_coords, CEED_MEM_HOST, &coords);
+  if (dim == 1) {
+    for (CeedInt i = 0; i < mesh_size; i++) {
+      // map [0,1] to [0,1] varying the mesh density
+      coords[i] = 0.5 + 1. / sqrt(3.) * sin((2. / 3.) * M_PI * (coords[i] - 0.5));
+    }
+    exact_volume = 1.;
+  } else {
+    CeedInt num_nodes = mesh_size / dim;
+    for (CeedInt i = 0; i < num_nodes; i++) {
+      // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
+      // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
+      CeedScalar u = coords[i], v = coords[i + num_nodes];
+
+      u                     = 1. + u;
+      v                     = M_PI_2 * v;
+      coords[i]             = u * cos(v);
+      coords[i + num_nodes] = u * sin(v);
+    }
+    exact_volume = 3. / 4. * M_PI;
+  }
+  CeedVectorRestoreArray(mesh_coords, &coords);
+  return exact_volume;
+}
diff --git a/examples/ceed/ex3-volume.h b/examples/ceed/ex3-volume.h
new file mode 100644
index 0000000000..0d2c0419e4
--- /dev/null
+++ b/examples/ceed/ex3-volume.h
@@ -0,0 +1,172 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+/// A structure used to pass additional data to f_build_mass_diff
+struct BuildContext {
+  CeedInt dim, space_dim;
+};
+
+/// libCEED Q-function for building quadrature data for a mass + diffusion operator
+CEED_QFUNCTION(build_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
+  // in[0] is Jacobians with shape [dim, dim, Q]
+  // in[1] is quadrature weights, size (Q)
+  const CeedScalar *w             = in[1];
+  CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
+
+  // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
+  // the symmetric part of the result.
+  switch (build_data->dim + 10 * build_data->space_dim) {
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        q_data[0][i] = w[i] * J[0][0][i];
+
+        // Diffusion
+        q_data[1][i] = w[i] / J[0][0][i];
+      }  // End of Quadrature Point Loop
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // J: 0 2   q_data: 0 2   adj(J):  J22 -J12
+        //    1 3           2 1           -J10  J00
+        const CeedScalar J00 = J[0][0][i];
+        const CeedScalar J10 = J[0][1][i];
+        const CeedScalar J01 = J[1][0][i];
+        const CeedScalar J11 = J[1][1][i];
+        const CeedScalar qw  = w[i] / (J00 * J11 - J10 * J01);
+
+        // Mass
+        q_data[0][i] = w[i] * (J00 * J11 - J10 * J01);
+
+        // Diffusion
+        q_data[1][i] = qw * (J01 * J01 + J11 * J11);
+        q_data[2][i] = qw * (J00 * J00 + J10 * J10);
+        q_data[3][i] = -qw * (J00 * J01 + J10 * J11);
+      }  // End of Quadrature Point Loop
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Compute the adjoint
+        CeedScalar A[3][3];
+
+        for (CeedInt j = 0; j < 3; j++) {
+          for (CeedInt k = 0; k < 3; k++) {
+            // Equivalent code with J as a VLA and no mod operations:
+            // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1]
+            A[k][j] =
+                J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i];
+          }
+        }
+
+        // Compute quadrature weight / det(J)
+        const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
+
+        // Mass
+        q_data[0][i] = w[i] * (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
+
+        // Diffusion
+        // Stored in Voigt convention
+        // 1 6 5
+        // 6 2 4
+        // 5 4 3
+        q_data[1][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
+        q_data[2][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
+        q_data[3][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
+        q_data[4][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
+        q_data[5][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
+        q_data[6][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
+      }  // End of Quadrature Point Loop
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// libCEED Q-function for applying a mass + diffusion operator
+CEED_QFUNCTION(apply_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
+  // in[1], out[1] solution values with shape [1, 1, Q]
+  // in[1], out[1] solution gradients with shape [dim, 1, Q]
+  // in[2] is quadrature data with shape [num_components, Q]
+  const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
+
+  switch (build_data->dim) {
+    case 1: {
+      const CeedScalar *u = in[0], *ug = in[1];
+      CeedScalar       *v = out[0], *vg = out[1];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[0][i] * u[i];
+
+        // Diffusion
+        vg[i] = q_data[1][i] * ug[i];
+      }  // End of Quadrature Point Loop
+    } break;
+    case 2: {
+      const CeedScalar *u               = in[0];
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
+      CeedScalar *v                     = out[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[1];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[0][i] * u[i];
+
+        // Diffusion
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 1 3
+        // 23 2
+        const CeedScalar dXdxdXdx_T[2][2] = {
+            {q_data[1][i], q_data[3][i]},
+            {q_data[3][i], q_data[2][i]}
+        };
+
+        // j = direction of vg
+        for (int j = 0; j < 2; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]);
+      }  // End of Quadrature Point Loop
+    } break;
+    case 3: {
+      const CeedScalar *u               = in[0];
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
+      CeedScalar *v                     = out[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[1];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[0][i] * u[i];
+
+        // Diffusion
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 1 6 5
+        // 6 2 4
+        // 5 4 3
+        const CeedScalar dXdxdXdx_T[3][3] = {
+            {q_data[1][i], q_data[6][i], q_data[5][i]},
+            {q_data[6][i], q_data[2][i], q_data[4][i]},
+            {q_data[5][i], q_data[4][i], q_data[3][i]}
+        };
+
+        // j = direction of vg
+        for (int j = 0; j < 3; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]);
+      }  // End of Quadrature Point Loop
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
diff --git a/examples/ceed/index.md b/examples/ceed/index.md
index 5d2d7a3807..02b0f41749 100644
--- a/examples/ceed/index.md
+++ b/examples/ceed/index.md
@@ -1,45 +1,39 @@
 # Standalone libCEED
 
-The following two examples have no dependencies, and are designed to be self-contained.
-For additional examples that use external discretization libraries (MFEM, PETSc, Nek5000
-etc.) see the subdirectories in {file}`examples/`.
+The following three examples have no dependencies, and are designed to be self-contained.
+For additional examples that use external discretization libraries (MFEM, PETSc, Nek5000 etc.) see the subdirectories in {file}`examples/`.
 
 (ex1-volume)=
 
 ## Ex1-Volume
 
-This example is located in the subdirectory {file}`examples/ceed`. It illustrates a
-simple usage of libCEED to compute the volume of a given body using a matrix-free
-application of the mass operator. Arbitrary mesh and solution orders in 1D, 2D, and 3D
-are supported from the same code.
+This example is located in the subdirectory {file}`examples/ceed`.
+It illustrates a simple usage of libCEED to compute the volume of a given body using a matrix-free application of the mass operator.
+Arbitrary mesh and solution orders in 1D, 2D, and 3D are supported from the same code.
 
-This example shows how to compute line/surface/volume integrals of a 1D, 2D, or 3D
-domain $\Omega$ respectively, by applying the mass operator to a vector of
-$1$s. It computes:
+This example shows how to compute line/surface/volume integrals of a 1D, 2D, or 3D domain $\Omega$ respectively, by applying the mass operator to a vector of $1$s.
+It computes:
 
 $$
 I = \int_{\Omega} 1 \, dV .
 $$ (eq-ex1-volume)
 
-Using the same notation as in {ref}`theoretical-framework`, we write here the vector
-$u(x)\equiv 1$ in the Galerkin approximation,
-and find the volume of $\Omega$ as
+Using the same notation as in {ref}`theoretical-framework`, we write here the vector $u(x)\equiv 1$ in the Galerkin approximation, and find the volume of $\Omega$ as
 
 $$
 \sum_e \int_{\Omega_e} v(x) 1 \, dV
 $$ (volume-sum)
 
-with $v(x) \in \mathcal{V}_p = \{ v \in H^{1}(\Omega_e) \,|\, v \in P_p(\bm{I}), e=1,\ldots,N_e \}$,
-the test functions.
+with $v(x) \in \mathcal{V}_p = \{ v \in H^{1}(\Omega_e) \,|\, v \in P_p(\bm{I}), e=1,\ldots,N_e \}$, the test functions.
 
 (ex2-surface)=
 
 ## Ex2-Surface
 
-This example is located in the subdirectory {file}`examples/ceed`. It computes the
-surface area of a given body using matrix-free application of a diffusion operator.
-Similar to {ref}`Ex1-Volume`, arbitrary mesh and solution orders in 1D, 2D, and 3D
-are supported from the same code. It computes:
+This example is located in the subdirectory {file}`examples/ceed`.
+It computes the surface area of a given body using matrix-free application of a diffusion operator.
+Similar to {ref}`Ex1-Volume`, arbitrary mesh and solution orders in 1D, 2D, and 3D are supported from the same code.
+It computes:
 
 $$
 I = \int_{\partial \Omega} 1 \, dS ,
@@ -65,3 +59,29 @@ Since we have chosen $u$ such that $\nabla u \cdot \hat{\bm n} = 1$, the boundar
 $$
 \int_\Omega \nabla v \cdot \nabla u \, dV \approx \sum_e \int_{\partial \Omega_e} v(x) 1 \, dS .
 $$
+
+(ex3-volume)=
+
+## Ex3-Volume
+
+This example is located in the subdirectory {file}`examples/ceed`.
+It illustrates a more complex usage of libCEED to compute the volume of a given body using a matrix-free application of the screened Poisson operator.
+Arbitrary mesh and solution orders in 1D, 2D, and 3D are supported from the same code.
+
+This example shows how to compute line/surface/volume integrals of a 1D, 2D, or 3D domain $\Omega$ respectively, by applying the screened Poisson operator to a vector of $1$s.
+It computes:
+
+$$
+I = \int_{\Omega} \left( 1 + \nabla^2 1 \right) \, dV .
+$$ (eq-ex3-volume)
+
+Using the same notation as in {ref}`theoretical-framework`, we write here the vector $u(x)\equiv 1$ in the Galerkin approximation, and find the volume of $\Omega$ as
+
+$$
+\sum_e \int_{\Omega_e}\left( v(x) 1 + \nabla v(x) \cdot 0 \right) \, dV
+$$ (volume-sum-mass-diff)
+
+with $v(x) \in \mathcal{V}_p = \{ v \in H^{1}(\Omega_e) \,|\, v \in P_p(\bm{I}), e=1,\ldots,N_e \}$, the test functions.
+
+The addition of the Poisson term is not needed to compute the volume of the region, as shown in example 1.
+Rather, this example illustrates the ability to add multiple evaluation modes for the same input or output vector in a libCEED operator.
diff --git a/examples/deal.II/CMakeLists.txt b/examples/deal.II/CMakeLists.txt
index 272facfc00..d5de2d7ddb 100644
--- a/examples/deal.II/CMakeLists.txt
+++ b/examples/deal.II/CMakeLists.txt
@@ -1,4 +1,4 @@
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8.8)
+CMAKE_MINIMUM_REQUIRED(VERSION 3.10.0)
 
 FIND_PACKAGE(deal.II 8.0 QUIET
   HINTS ${deal.II_DIR} ${DEAL_II_DIR} ../ ../../ $ENV{DEAL_II_DIR}
@@ -11,13 +11,21 @@ IF(NOT ${deal.II_FOUND})
     )
 ENDIF()
 
-DEAL_II_INITIALIZE_CACHED_VARIABLES()
-PROJECT("bps")
+FILE(GLOB SOURCE_FILES "*.cc")
 
-DEAL_II_INITIALIZE_CACHED_VARIABLES()
+FOREACH ( source_file ${SOURCE_FILES} )
+  GET_FILENAME_COMPONENT(file_name ${source_file} NAME)
+  STRING( REPLACE ".cc" "" exec ${file_name} )
 
-ADD_EXECUTABLE(bps bps.cc)
-DEAL_II_SETUP_TARGET(bps)
+  DEAL_II_INITIALIZE_CACHED_VARIABLES()
+  PROJECT(${exec})
 
-TARGET_INCLUDE_DIRECTORIES(bps PUBLIC ${CEED_DIR}/include)
-TARGET_LINK_LIBRARIES(bps ${CEED_DIR}/lib/libceed.so)
+  DEAL_II_INITIALIZE_CACHED_VARIABLES()
+
+  ADD_EXECUTABLE(${exec} ${source_file})
+  DEAL_II_SETUP_TARGET(${exec})
+
+  TARGET_INCLUDE_DIRECTORIES(${exec} PUBLIC ${CEED_DIR}/include)
+  TARGET_LINK_LIBRARIES(${exec} ${CEED_DIR}/lib/libceed.so)
+
+ENDFOREACH ( source_file ${SOURCE_FILES} )
diff --git a/examples/deal.II/README.MD b/examples/deal.II/README.md
similarity index 59%
rename from examples/deal.II/README.MD
rename to examples/deal.II/README.md
index cd3f14a3cb..18dba6dd7c 100644
--- a/examples/deal.II/README.MD
+++ b/examples/deal.II/README.md
@@ -1,6 +1,7 @@
-An example how to write libCEED operators (BP1-BP6) within the open-source
-finite element library [deal.II](https://www.dealii.org/). As reference,
-operators are presented that use the native matrix-free infrastructure.
+## libCEED deal.II Example
+
+An example how to write libCEED operators (BP1-BP6) within the open-source finite element library [deal.II](https://www.dealii.org/).
+As reference, operators are presented that use the native matrix-free infrastructure.
 
 First compile deal.II and libCEED individually. After that, compile the deal.II example:
 
@@ -11,10 +12,14 @@ cmake ../ -DDEAL_II_DIR=~/path/to/dealii -DCEED_DIR=~/path/to/libceed
 make
 ```
 
-To run the executable, write:
+To run the executables, write:
+
+```
+./bps_cpu
+```
 
 ```
-./bps
+./bps_kokkos
 ```
 
 Optional command-line arguments are shown by adding the command-line argument "--help".
diff --git a/examples/deal.II/bps-ceed.h b/examples/deal.II/bps-ceed.h
new file mode 100644
index 0000000000..f9041d4c6f
--- /dev/null
+++ b/examples/deal.II/bps-ceed.h
@@ -0,0 +1,648 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2023 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+//  Authors: Peter Munch, Martin Kronbichler
+//
+// ---------------------------------------------------------------------
+
+#pragma once
+#ifndef bps_ceed_h
+#  define bps_ceed_h
+
+// deal.II includes
+#  include <deal.II/dofs/dof_tools.h>
+
+#  include <deal.II/fe/mapping.h>
+
+#  include <deal.II/lac/la_parallel_vector.h>
+
+#  include <deal.II/matrix_free/fe_evaluation.h>
+#  include <deal.II/matrix_free/matrix_free.h>
+#  include <deal.II/matrix_free/shape_info.h>
+#  include <deal.II/matrix_free/tools.h>
+
+// local includes
+#  include "bps.h"
+
+// libCEED includes
+#  include <ceed.h>
+#  include <ceed/backend.h>
+
+// QFunction source
+#  include "bps-qfunctions.h"
+
+using namespace dealii;
+
+
+/**
+ * Operator implementation using libCEED.
+ */
+template <int dim, typename Number, typename MemorySpace = MemorySpace::Host>
+class OperatorCeed : public OperatorBase<Number, MemorySpace>
+{
+public:
+  using VectorType = typename OperatorBase<Number, MemorySpace>::VectorType;
+
+  /**
+   * Constructor.
+   */
+  OperatorCeed(const Mapping<dim>              &mapping,
+               const DoFHandler<dim>           &dof_handler,
+               const AffineConstraints<Number> &constraints,
+               const Quadrature<dim>           &quadrature,
+               const BPType                    &bp,
+               const std::string               &resource)
+    : mapping(mapping)
+    , dof_handler(dof_handler)
+    , constraints(constraints)
+    , quadrature(quadrature)
+    , bp(bp)
+    , resource(resource)
+  {
+    reinit();
+  }
+
+  /**
+   * Destructor.
+   */
+  ~OperatorCeed()
+  {
+    CeedVectorDestroy(&src_ceed);
+    CeedVectorDestroy(&dst_ceed);
+    CeedOperatorDestroy(&op_apply);
+    CeedDestroy(&ceed);
+  }
+
+  /**
+   * Initialized internal data structures, particularly, libCEED.
+   */
+  void
+  reinit() override
+  {
+    CeedVector           metric_data;
+    CeedBasis            sol_basis;
+    CeedElemRestriction  sol_restriction;
+    CeedElemRestriction  metric_data_restriction;
+    BuildContext         build_ctx_data;
+    CeedQFunctionContext build_ctx;
+    CeedQFunction        qf_apply;
+
+    const auto &tria = dof_handler.get_triangulation();
+    const auto &fe   = dof_handler.get_fe();
+
+    const auto n_components = fe.n_components();
+
+    if (bp == BPType::BP1 || bp == BPType::BP3 || bp == BPType::BP5)
+      {
+        AssertThrow(n_components == 1, ExcInternalError());
+      }
+    else
+      {
+        AssertThrow(n_components == dim, ExcInternalError());
+      }
+
+    // 1) create CEED instance -> "MatrixFree"
+    const char *ceed_spec = resource.c_str();
+    CeedInit(ceed_spec, &ceed);
+
+    // 2) create shape functions -> "ShapeInfo"
+    const unsigned int fe_degree  = fe.tensor_degree();
+    const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
+    {
+      const dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info(quadrature, fe, 0);
+      const auto             &shape_data = shape_info.get_shape_data();
+      std::vector<CeedScalar> q_ref_1d;
+      for (const auto q : shape_data.quadrature.get_points())
+        q_ref_1d.push_back(q(0));
+
+      // transpose bases for compatibility with restriction
+      std::vector<CeedScalar> interp_1d(shape_data.shape_values.size());
+      std::vector<CeedScalar> grad_1d(shape_data.shape_gradients.size());
+      for (unsigned int i = 0; i < n_q_points; ++i)
+        for (unsigned int j = 0; j < fe_degree + 1; ++j)
+          {
+            interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j * n_q_points + i];
+            grad_1d[j + i * (fe_degree + 1)]   = shape_data.shape_gradients[j * n_q_points + i];
+          }
+
+      CeedBasisCreateTensorH1(ceed,
+                              dim,
+                              n_components,
+                              fe_degree + 1,
+                              n_q_points,
+                              interp_1d.data(),
+                              grad_1d.data(),
+                              q_ref_1d.data(),
+                              quadrature.get_tensor_basis()[0].get_weights().data(),
+                              &sol_basis);
+    }
+
+    // 3) create restriction matrix -> DoFInfo
+    unsigned int n_local_active_cells = 0;
+
+    for (const auto &cell : dof_handler.active_cell_iterators())
+      if (cell->is_locally_owned())
+        n_local_active_cells++;
+
+    partitioner =
+      std::make_shared<Utilities::MPI::Partitioner>(dof_handler.locally_owned_dofs(),
+                                                    DoFTools::extract_locally_active_dofs(
+                                                      dof_handler),
+                                                    dof_handler.get_communicator());
+
+    std::vector<CeedInt> indices;
+    indices.reserve(n_local_active_cells * fe.n_dofs_per_cell() / n_components);
+
+    const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering<dim>(fe_degree);
+
+    std::vector<types::global_dof_index> local_indices(fe.n_dofs_per_cell());
+
+    for (const auto &cell : dof_handler.active_cell_iterators())
+      if (cell->is_locally_owned())
+        {
+          cell->get_dof_indices(local_indices);
+
+          for (const auto i : dof_mapping)
+            indices.emplace_back(
+              partitioner->global_to_local(local_indices[fe.component_to_system_index(0, i)]));
+        }
+
+    CeedElemRestrictionCreate(ceed,
+                              n_local_active_cells,
+                              fe.n_dofs_per_cell() / n_components,
+                              n_components,
+                              1,
+                              this->extended_local_size(),
+                              CEED_MEM_HOST,
+                              CEED_COPY_VALUES,
+                              indices.data(),
+                              &sol_restriction);
+
+    // 4) create mapping -> MappingInfo
+    const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
+
+    metric_data_raw = compute_metric_data(ceed, mapping, tria, quadrature, bp);
+
+    strides = {{1,
+                static_cast<int>(quadrature.size()),
+                static_cast<int>(quadrature.size() * n_components_metric)}};
+    CeedVectorCreate(ceed, metric_data_raw.size(), &metric_data);
+    CeedVectorSetArray(metric_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data_raw.data());
+    CeedElemRestrictionCreateStrided(ceed,
+                                     n_local_active_cells,
+                                     quadrature.size(),
+                                     n_components_metric,
+                                     metric_data_raw.size(),
+                                     strides.data(),
+                                     &metric_data_restriction);
+
+    build_ctx_data.dim       = dim;
+    build_ctx_data.space_dim = dim;
+
+    CeedQFunctionContextCreate(ceed, &build_ctx);
+    CeedQFunctionContextSetData(
+      build_ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(build_ctx_data), &build_ctx_data);
+
+    // 5) create q operation
+    if (bp == BPType::BP1)
+      CeedQFunctionCreateInterior(ceed, 1, f_apply_mass, f_apply_mass_loc, &qf_apply);
+    else if (bp == BPType::BP2)
+      CeedQFunctionCreateInterior(ceed, 1, f_apply_mass_vec, f_apply_mass_vec_loc, &qf_apply);
+    else if (bp == BPType::BP3 || bp == BPType::BP5)
+      CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson, f_apply_poisson_loc, &qf_apply);
+    else if (bp == BPType::BP4 || bp == BPType::BP6)
+      CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson_vec, f_apply_poisson_vec_loc, &qf_apply);
+    else
+      AssertThrow(false, ExcInternalError());
+
+    if (bp <= BPType::BP2)
+      CeedQFunctionAddInput(qf_apply, "u", n_components, CEED_EVAL_INTERP);
+    else
+      CeedQFunctionAddInput(qf_apply, "u", dim * n_components, CEED_EVAL_GRAD);
+
+    CeedQFunctionAddInput(qf_apply, "metric data", n_components_metric, CEED_EVAL_NONE);
+
+    if (bp <= BPType::BP2)
+      CeedQFunctionAddOutput(qf_apply, "v", n_components, CEED_EVAL_INTERP);
+    else
+      CeedQFunctionAddOutput(qf_apply, "v", dim * n_components, CEED_EVAL_GRAD);
+
+    CeedQFunctionSetContext(qf_apply, build_ctx);
+
+    // 6) put everything together
+    CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
+
+    CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(
+      op_apply, "metric data", metric_data_restriction, CEED_BASIS_NONE, metric_data);
+    CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+
+    // 7) libCEED vectors
+    CeedElemRestrictionCreateVector(sol_restriction, &src_ceed, NULL);
+    CeedElemRestrictionCreateVector(sol_restriction, &dst_ceed, NULL);
+
+    // 8) cleanup
+    CeedVectorDestroy(&metric_data);
+    CeedElemRestrictionDestroy(&metric_data_restriction);
+    CeedElemRestrictionDestroy(&sol_restriction);
+    CeedBasisDestroy(&sol_basis);
+    CeedQFunctionContextDestroy(&build_ctx);
+    CeedQFunctionDestroy(&qf_apply);
+  }
+
+  /**
+   * Perform matrix-vector product.
+   */
+  void
+  vmult(VectorType &dst, const VectorType &src) const override
+  {
+    // communicate: update ghost values
+    src.update_ghost_values();
+
+    // pass memory buffers to libCEED
+    VectorTypeCeed x(src_ceed);
+    VectorTypeCeed y(dst_ceed);
+    x.import_array(src, CEED_MEM_HOST);
+    y.import_array(dst, CEED_MEM_HOST);
+
+    // apply operator
+    CeedOperatorApply(op_apply, x(), y(), CEED_REQUEST_IMMEDIATE);
+
+    // pull arrays back to deal.II
+    x.take_array();
+    y.take_array();
+
+    // communicate: compress
+    src.zero_out_ghost_values();
+    dst.compress(VectorOperation::add);
+
+    // apply constraints: we assume homogeneous DBC
+    constraints.set_zero(dst);
+  }
+
+  /**
+   * Initialized vector.
+   */
+  void
+  initialize_dof_vector(VectorType &vec) const override
+  {
+    vec.reinit(partitioner);
+  }
+
+  /**
+   * Compute inverse of diagonal.
+   */
+  void
+  compute_inverse_diagonal(VectorType &diagonal) const override
+  {
+    this->initialize_dof_vector(diagonal);
+
+    // pass memory buffer to libCEED
+    VectorTypeCeed y(dst_ceed);
+    y.import_array(diagonal, CEED_MEM_HOST);
+
+    CeedOperatorLinearAssembleDiagonal(op_apply, y(), CEED_REQUEST_IMMEDIATE);
+
+    // pull array back to deal.II
+    y.take_array();
+
+    diagonal.compress(VectorOperation::add);
+
+    // apply constraints: we assume homogeneous DBC
+    constraints.set_zero(diagonal);
+
+    for (auto &i : diagonal)
+      i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0;
+  }
+
+private:
+  /**
+   * Wrapper around a deal.II vector to create a libCEED vector view.
+   */
+  class VectorTypeCeed
+  {
+  public:
+    /**
+     * Constructor.
+     */
+    VectorTypeCeed(const CeedVector &vec_orig)
+    {
+      vec_ceed = NULL;
+      CeedVectorReferenceCopy(vec_orig, &vec_ceed);
+    }
+
+    /**
+     * Return libCEED vector view.
+     */
+    CeedVector &
+    operator()()
+    {
+      return vec_ceed;
+    }
+
+    /**
+     * Set deal.II memory in libCEED vector.
+     */
+    void
+    import_array(const VectorType &vec, const CeedMemType space)
+    {
+      mem_space = space;
+      CeedVectorSetArray(vec_ceed, mem_space, CEED_USE_POINTER, vec.get_values());
+    }
+
+    /**
+     * Sync memory from device to host.
+     */
+    void
+    sync_array()
+    {
+      CeedVectorSyncArray(vec_ceed, mem_space);
+    }
+
+    /**
+     * Take previously set deal.II array from libCEED vector
+     */
+    void
+    take_array()
+    {
+      CeedScalar *ptr;
+      CeedVectorTakeArray(vec_ceed, mem_space, &ptr);
+    }
+
+    /**
+     * Destructor: destroy vector view.
+     */
+    ~VectorTypeCeed()
+    {
+      bool has_array;
+      CeedVectorHasBorrowedArrayOfType(vec_ceed, mem_space, &has_array);
+      if (has_array)
+        {
+          CeedScalar *ptr;
+          CeedVectorTakeArray(vec_ceed, mem_space, &ptr);
+        }
+      CeedVectorDestroy(&vec_ceed);
+    }
+
+  private:
+    /**
+     * libCEED vector view.
+     */
+    CeedMemType mem_space;
+    CeedVector  vec_ceed;
+  };
+
+  /**
+   * Number of locally active DoFs.
+   */
+  unsigned int
+  extended_local_size() const
+  {
+    return partitioner->locally_owned_size() + partitioner->n_ghost_indices();
+  }
+
+  /**
+   * Compute metric data: Jacobian, ...
+   */
+  static std::vector<double>
+  compute_metric_data(const Ceed               &ceed,
+                      const Mapping<dim>       &mapping,
+                      const Triangulation<dim> &tria,
+                      const Quadrature<dim>    &quadrature,
+                      const BPType              bp)
+  {
+    std::vector<double> metric_data_raw;
+
+    CeedBasis            geo_basis;
+    CeedVector           metric_data;
+    CeedElemRestriction  metric_data_restriction;
+    CeedVector           node_coords;
+    CeedElemRestriction  geo_restriction;
+    CeedQFunctionContext build_ctx;
+    CeedQFunction        qf_build;
+    CeedOperator         op_build;
+
+    const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
+
+    const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
+
+    const auto mapping_q = dynamic_cast<const MappingQ<dim> *>(&mapping);
+
+    AssertThrow(mapping_q, ExcMessage("Wrong mapping!"));
+
+    const unsigned int fe_degree = mapping_q->get_degree();
+
+    FE_Q<dim> geo_fe(fe_degree);
+
+    {
+      const dealii::internal::MatrixFreeFunctions::ShapeInfo<double> shape_info(quadrature,
+                                                                                geo_fe,
+                                                                                0);
+      const auto             &shape_data = shape_info.get_shape_data();
+      std::vector<CeedScalar> q_ref_1d;
+      for (const auto q : shape_data.quadrature.get_points())
+        q_ref_1d.push_back(q(0));
+
+      // transpose bases for compatibility with restriction
+      std::vector<CeedScalar> interp_1d(shape_data.shape_values.size());
+      std::vector<CeedScalar> grad_1d(shape_data.shape_gradients.size());
+      for (unsigned int i = 0; i < n_q_points; ++i)
+        for (unsigned int j = 0; j < fe_degree + 1; ++j)
+          {
+            interp_1d[j + i * (fe_degree + 1)] = shape_data.shape_values[j * n_q_points + i];
+            grad_1d[j + i * (fe_degree + 1)]   = shape_data.shape_gradients[j * n_q_points + i];
+          }
+
+      CeedBasisCreateTensorH1(ceed,
+                              dim,
+                              dim,
+                              fe_degree + 1,
+                              n_q_points,
+                              interp_1d.data(),
+                              grad_1d.data(),
+                              q_ref_1d.data(),
+                              quadrature.get_tensor_basis()[0].get_weights().data(),
+                              &geo_basis);
+    }
+
+    unsigned int n_local_active_cells = 0;
+
+    for (const auto &cell : tria.active_cell_iterators())
+      if (cell->is_locally_owned())
+        n_local_active_cells++;
+
+    std::vector<double>  geo_support_points;
+    std::vector<CeedInt> geo_indices;
+
+    DoFHandler<dim> geo_dof_handler(tria);
+    geo_dof_handler.distribute_dofs(geo_fe);
+
+    const auto geo_partitioner =
+      std::make_shared<Utilities::MPI::Partitioner>(geo_dof_handler.locally_owned_dofs(),
+                                                    DoFTools::extract_locally_active_dofs(
+                                                      geo_dof_handler),
+                                                    geo_dof_handler.get_communicator());
+
+    geo_indices.reserve(n_local_active_cells * geo_fe.n_dofs_per_cell());
+
+    const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering<dim>(fe_degree);
+
+    FEValues<dim> fe_values(mapping,
+                            geo_fe,
+                            geo_fe.get_unit_support_points(),
+                            update_quadrature_points);
+
+    std::vector<types::global_dof_index> local_indices(geo_fe.n_dofs_per_cell());
+
+    const unsigned int n_points =
+      geo_partitioner->locally_owned_size() + geo_partitioner->n_ghost_indices();
+
+    geo_support_points.resize(dim * n_points);
+
+    for (const auto &cell : geo_dof_handler.active_cell_iterators())
+      if (cell->is_locally_owned())
+        {
+          fe_values.reinit(cell);
+          cell->get_dof_indices(local_indices);
+
+          for (const auto i : dof_mapping)
+            {
+              const auto index = geo_partitioner->global_to_local(local_indices[i]);
+              geo_indices.emplace_back(index * dim);
+
+              const auto point = fe_values.quadrature_point(i);
+
+              for (unsigned int d = 0; d < dim; ++d)
+                geo_support_points[index * dim + d] = point[d];
+            }
+        }
+
+    metric_data_raw.resize(n_local_active_cells * quadrature.size() * n_components_metric);
+
+    CeedInt strides[3] = {1,
+                          static_cast<int>(quadrature.size()),
+                          static_cast<int>(quadrature.size() * n_components_metric)};
+
+    CeedVectorCreate(ceed, metric_data_raw.size(), &metric_data);
+    CeedVectorSetArray(metric_data, CEED_MEM_HOST, CEED_USE_POINTER, metric_data_raw.data());
+    CeedElemRestrictionCreateStrided(ceed,
+                                     n_local_active_cells,
+                                     quadrature.size(),
+                                     n_components_metric,
+                                     metric_data_raw.size(),
+                                     strides,
+                                     &metric_data_restriction);
+
+    CeedVectorCreate(ceed, geo_support_points.size(), &node_coords);
+    CeedVectorSetArray(node_coords, CEED_MEM_HOST, CEED_USE_POINTER, geo_support_points.data());
+
+    CeedElemRestrictionCreate(ceed,
+                              n_local_active_cells,
+                              geo_fe.n_dofs_per_cell(),
+                              dim,
+                              1,
+                              geo_support_points.size(),
+                              CEED_MEM_HOST,
+                              CEED_COPY_VALUES,
+                              geo_indices.data(),
+                              &geo_restriction);
+
+    BuildContext build_ctx_data;
+    build_ctx_data.dim       = dim;
+    build_ctx_data.space_dim = dim;
+
+    CeedQFunctionContextCreate(ceed, &build_ctx);
+    CeedQFunctionContextSetData(
+      build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
+
+    // 5) create q operation
+    if (bp <= BPType::BP2)
+      CeedQFunctionCreateInterior(ceed, 1, f_build_mass, f_build_mass_loc, &qf_build);
+    else
+      CeedQFunctionCreateInterior(ceed, 1, f_build_poisson, f_build_poisson_loc, &qf_build);
+
+    CeedQFunctionAddInput(qf_build, "geo", dim * dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddInput(qf_build, "weight", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddOutput(qf_build, "metric data", n_components_metric, CEED_EVAL_NONE);
+    CeedQFunctionSetContext(qf_build, build_ctx);
+
+    // 6) put everything together
+    CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
+    CeedOperatorSetField(op_build, "geo", geo_restriction, geo_basis, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(
+      op_build, "weight", CEED_ELEMRESTRICTION_NONE, geo_basis, CEED_VECTOR_NONE);
+    CeedOperatorSetField(
+      op_build, "metric data", metric_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+
+    CeedOperatorApply(op_build, node_coords, metric_data, CEED_REQUEST_IMMEDIATE);
+
+    CeedVectorDestroy(&node_coords);
+    CeedVectorSyncArray(metric_data, CEED_MEM_HOST);
+    CeedVectorDestroy(&metric_data);
+    CeedElemRestrictionDestroy(&geo_restriction);
+    CeedElemRestrictionDestroy(&metric_data_restriction);
+    CeedBasisDestroy(&geo_basis);
+    CeedQFunctionContextDestroy(&build_ctx);
+    CeedQFunctionDestroy(&qf_build);
+    CeedOperatorDestroy(&op_build);
+
+    return metric_data_raw;
+  }
+
+  /**
+   * Mapping object passed to the constructor.
+   */
+  const Mapping<dim> &mapping;
+
+  /**
+   * DoFHandler object passed to the constructor.
+   */
+  const DoFHandler<dim> &dof_handler;
+
+  /**
+   * Constraints object passed to the constructor.
+   */
+  const AffineConstraints<Number> &constraints;
+
+  /**
+   * Quadrature rule object passed to the constructor.
+   */
+  const Quadrature<dim> &quadrature;
+
+  /**
+   * Selected BP.
+   */
+  const BPType bp;
+
+  /**
+   * Resource name.
+   */
+  const std::string resource;
+
+  /**
+   * Partitioner for distributed vectors.
+   */
+  std::shared_ptr<Utilities::MPI::Partitioner> partitioner;
+
+  /**
+   * libCEED data structures.
+   */
+  Ceed                   ceed;
+  std::vector<double>    metric_data_raw;
+  std::array<CeedInt, 3> strides;
+  CeedVector             src_ceed;
+  CeedVector             dst_ceed;
+  CeedOperator           op_apply;
+};
+
+#endif
diff --git a/examples/deal.II/bps.cc b/examples/deal.II/bps-cpu.cc
similarity index 97%
rename from examples/deal.II/bps.cc
rename to examples/deal.II/bps-cpu.cc
index 9d72710d65..2355078ccf 100644
--- a/examples/deal.II/bps.cc
+++ b/examples/deal.II/bps-cpu.cc
@@ -46,7 +46,8 @@
 #include <sstream>
 
 // include operators
-#include "bps.h"
+#include "bps-ceed.h"
+#include "bps-cpu.h"
 
 // Test cases
 //TESTARGS(name="BP1") --resource {ceed_resource} --bp BP1 --fe_degree 2 --print_timings 0
@@ -61,7 +62,7 @@ struct Parameters
   unsigned int n_global_refinements = 1;
   unsigned int fe_degree            = 2;
   bool         print_timings        = true;
-  std::string  libCEED_resource      = "/cpu/self/avx/blocked";
+  std::string  libCEED_resource     = "/cpu/self";
 
   bool
   parse(int argc, char *argv[])
@@ -167,7 +168,7 @@ main(int argc, char *argv[])
 #ifdef DEAL_II_WITH_P4EST
   parallel::distributed::Triangulation<dim> tria(MPI_COMM_WORLD);
 #else
-  parallel::shared::Triangulation<dim> tria(MPI_COMM_WORLD, ::Triangulation<dim>::none, true);
+  Triangulation<dim> tria;
 #endif
 
   GridGenerator::hyper_cube(tria);
@@ -176,6 +177,8 @@ main(int argc, char *argv[])
   DoFHandler<dim> dof_handler(tria);
   dof_handler.distribute_dofs(fe);
 
+  DoFRenumbering::support_point_wise(dof_handler);
+
   AffineConstraints<Number> constraints;
 
   if (!(bp == BPType::BP1 || bp == BPType::BP2))
@@ -185,8 +188,6 @@ main(int argc, char *argv[])
       constraints.close();
     }
 
-  DoFRenumbering::support_point_wise(dof_handler);
-
   const auto test = [&](const std::string &label, const auto &op) {
     (void)label;
 
diff --git a/examples/deal.II/bps-cpu.h b/examples/deal.II/bps-cpu.h
new file mode 100644
index 0000000000..71c00cea5d
--- /dev/null
+++ b/examples/deal.II/bps-cpu.h
@@ -0,0 +1,219 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2023 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+//  Authors: Peter Munch, Martin Kronbichler
+//
+// ---------------------------------------------------------------------
+
+#pragma once
+#ifndef bps_cpu_h
+#  define bps_cpu_h
+
+// deal.II includes
+#  include <deal.II/dofs/dof_tools.h>
+
+#  include <deal.II/fe/mapping.h>
+
+#  include <deal.II/lac/la_parallel_vector.h>
+
+#  include <deal.II/matrix_free/fe_evaluation.h>
+#  include <deal.II/matrix_free/matrix_free.h>
+#  include <deal.II/matrix_free/shape_info.h>
+#  include <deal.II/matrix_free/tools.h>
+
+// local includes
+#  include "bps.h"
+
+using namespace dealii;
+
+
+
+/**
+ * Operator CPU implementation using deal.II.
+ */
+template <int dim, typename Number>
+class OperatorDealii : public OperatorBase<Number, MemorySpace::Host>
+{
+public:
+  using VectorType = typename OperatorBase<Number, MemorySpace::Host>::VectorType;
+
+  /**
+   * Constructor.
+   */
+  OperatorDealii(const Mapping<dim>              &mapping,
+                 const DoFHandler<dim>           &dof_handler,
+                 const AffineConstraints<Number> &constraints,
+                 const Quadrature<dim>           &quadrature,
+                 const BPType                    &bp)
+    : mapping(mapping)
+    , dof_handler(dof_handler)
+    , constraints(constraints)
+    , quadrature(quadrature)
+    , bp(bp)
+  {
+    reinit();
+  }
+
+  /**
+   * Destructor.
+   */
+  ~OperatorDealii() = default;
+
+  /**
+   * Initialized internal data structures, particularly, MatrixFree.
+   */
+  void
+  reinit() override
+  {
+    // configure MatrixFree
+    typename MatrixFree<dim, Number>::AdditionalData additional_data;
+    additional_data.tasks_parallel_scheme =
+      MatrixFree<dim, Number>::AdditionalData::TasksParallelScheme::none;
+
+    // create MatrixFree
+    matrix_free.reinit(mapping, dof_handler, constraints, quadrature, additional_data);
+  }
+
+  /**
+   * Matrix-vector product.
+   */
+  void
+  vmult(VectorType &dst, const VectorType &src) const override
+  {
+    if (dof_handler.get_fe().n_components() == 1)
+      {
+        matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range<1>, this, dst, src, true);
+      }
+    else
+      {
+        AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError());
+
+        matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range<dim>, this, dst, src, true);
+      }
+  }
+
+  /**
+   * Initialize vector.
+   */
+  void
+  initialize_dof_vector(VectorType &vec) const override
+  {
+    matrix_free.initialize_dof_vector(vec);
+  }
+
+  /**
+   * Compute inverse of diagonal.
+   */
+  void
+  compute_inverse_diagonal(VectorType &diagonal) const override
+  {
+    this->initialize_dof_vector(diagonal);
+
+    if (dof_handler.get_fe().n_components() == 1)
+      {
+        MatrixFreeTools::compute_diagonal(matrix_free,
+                                          diagonal,
+                                          &OperatorDealii::do_cell_integral_local<1>,
+                                          this);
+      }
+    else
+      {
+        AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError());
+
+        MatrixFreeTools::compute_diagonal(matrix_free,
+                                          diagonal,
+                                          &OperatorDealii::do_cell_integral_local<dim>,
+                                          this);
+      }
+
+    for (auto &i : diagonal)
+      i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0;
+  }
+
+private:
+  /**
+   * Cell integral without vector access.
+   */
+  template <int n_components>
+  void
+  do_cell_integral_local(FEEvaluation<dim, -1, 0, n_components, Number> &phi) const
+  {
+    if (bp <= BPType::BP2) // mass matrix
+      {
+        phi.evaluate(EvaluationFlags::values);
+        for (const auto q : phi.quadrature_point_indices())
+          phi.submit_value(phi.get_value(q), q);
+        phi.integrate(EvaluationFlags::values);
+      }
+    else // Poisson operator
+      {
+        phi.evaluate(EvaluationFlags::gradients);
+        for (const auto q : phi.quadrature_point_indices())
+          phi.submit_gradient(phi.get_gradient(q), q);
+        phi.integrate(EvaluationFlags::gradients);
+      }
+  }
+
+  /**
+   * Cell integral on a range of cells.
+   */
+  template <int n_components>
+  void
+  do_cell_integral_range(const MatrixFree<dim, Number>               &matrix_free,
+                         VectorType                                  &dst,
+                         const VectorType                            &src,
+                         const std::pair<unsigned int, unsigned int> &range) const
+  {
+    FEEvaluation<dim, -1, 0, n_components, Number> phi(matrix_free, range);
+
+    for (unsigned cell = range.first; cell < range.second; ++cell)
+      {
+        phi.reinit(cell);
+        phi.read_dof_values(src);            // read source vector
+        do_cell_integral_local(phi);         // cell integral
+        phi.distribute_local_to_global(dst); // write to destination vector
+      }
+  }
+
+  /**
+   * Mapping object passed to the constructor.
+   */
+  const Mapping<dim> &mapping;
+
+  /**
+   * DoFHandler object passed to the constructor.
+   */
+  const DoFHandler<dim> &dof_handler;
+
+  /**
+   * Constraints object passed to the constructor.
+   */
+  const AffineConstraints<Number> &constraints;
+
+  /**
+   * Quadrature rule object passed to the constructor.
+   */
+  const Quadrature<dim> &quadrature;
+
+  /**
+   * Selected BP.
+   */
+  const BPType bp;
+
+  /**
+   * MatrixFree object.
+   */
+  MatrixFree<dim, Number> matrix_free;
+};
+
+#endif
diff --git a/examples/deal.II/bps-kokkos.cc b/examples/deal.II/bps-kokkos.cc
new file mode 100644
index 0000000000..86ef1a1693
--- /dev/null
+++ b/examples/deal.II/bps-kokkos.cc
@@ -0,0 +1,251 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2023 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+//  Authors: Peter Munch, Martin Kronbichler
+//
+// ---------------------------------------------------------------------
+
+// deal.II includes
+#include <deal.II/base/conditional_ostream.h>
+#include <deal.II/base/mpi.h>
+#include <deal.II/base/parameter_handler.h>
+#include <deal.II/base/quadrature_lib.h>
+
+#include <deal.II/distributed/shared_tria.h>
+#include <deal.II/distributed/tria.h>
+
+#include <deal.II/dofs/dof_handler.h>
+#include <deal.II/dofs/dof_renumbering.h>
+
+#include <deal.II/fe/fe_nothing.h>
+#include <deal.II/fe/fe_q.h>
+#include <deal.II/fe/fe_system.h>
+#include <deal.II/fe/fe_tools.h>
+#include <deal.II/fe/fe_values.h>
+#include <deal.II/fe/mapping_q1.h>
+
+#include <deal.II/grid/grid_generator.h>
+
+#include <deal.II/lac/affine_constraints.h>
+#include <deal.II/lac/precondition.h>
+#include <deal.II/lac/solver_cg.h>
+
+// boost
+#include <boost/algorithm/string.hpp>
+
+#include <sstream>
+
+// include operators
+#include "bps-ceed.h"
+#include "bps-kokkos.h"
+
+// Test cases
+//TESTARGS(name="BP1") --resource {ceed_resource} --bp BP1 --fe_degree 2 --print_timings 0
+//TESTARGS(name="BP4") --resource {ceed_resource} --bp BP4 --fe_degree 1 --print_timings 0
+
+/**
+ * Relevant parameters.
+ */
+struct Parameters
+{
+  BPType       bp                   = BPType::BP5;
+  unsigned int n_global_refinements = 1;
+  unsigned int fe_degree            = 2;
+  bool         print_timings        = true;
+  std::string  libCEED_resource     = "/cpu/self";
+
+  bool
+  parse(int argc, char *argv[])
+  {
+    if (argc == 1 && (std::string(argv[0]) == "--help"))
+      {
+        std::cout << "Usage: ./bp [OPTION]..." << std::endl;
+        std::cout << std::endl;
+        std::cout << "--bp             name of benchmark (BP1-BP6)" << std::endl;
+        std::cout << "--n_refinements  number of refinements (0-)" << std::endl;
+        std::cout << "--fe_degree      polynomial degree (1-)" << std::endl;
+        std::cout << "--print_timings  name of benchmark (0, 1)" << std::endl;
+        std::cout << "--resource       name of resource (e.g., /cpu/self/avx/blocked)" << std::endl;
+
+        return true;
+      }
+
+    AssertThrow(argc % 2 == 0, ExcInternalError());
+
+    while (argc > 0)
+      {
+        std::string label(argv[0]);
+
+        if ("--bp" == label)
+          {
+            std::string bp_string(argv[1]);
+
+            if (bp_string == "BP1")
+              bp = BPType::BP1;
+            else if (bp_string == "BP2")
+              bp = BPType::BP2;
+            else if (bp_string == "BP3")
+              bp = BPType::BP3;
+            else if (bp_string == "BP4")
+              bp = BPType::BP4;
+            else if (bp_string == "BP5")
+              bp = BPType::BP5;
+            else if (bp_string == "BP6")
+              bp = BPType::BP6;
+            else
+              AssertThrow(false, ExcInternalError());
+          }
+        else if ("--n_refinements" == label)
+          {
+            n_global_refinements = std::atoi(argv[1]);
+          }
+        else if ("--fe_degree" == label)
+          {
+            fe_degree = std::atoi(argv[1]);
+          }
+        else if ("--print_timings" == label)
+          {
+            print_timings = std::atoi(argv[1]);
+          }
+        else if ("--resource" == label)
+          {
+            libCEED_resource = std::string(argv[1]);
+          }
+        else
+          {
+            AssertThrow(false, ExcNotImplemented());
+          }
+
+
+        argc -= 2;
+        argv += 2;
+      }
+
+    return false;
+  }
+};
+
+
+
+int
+main(int argc, char *argv[])
+{
+  Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv, 1);
+
+  Parameters params;
+  if (params.parse(argc - 1, argv + 1))
+    return 0;
+
+  ConditionalOStream pout(std::cout, Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) == 0);
+
+  //  configuration
+  const BPType bp = params.bp;
+
+  using Number                     = double;
+  using VectorType                 = LinearAlgebra::distributed::Vector<Number, MemorySpace::Default>;
+  const unsigned int dim           = 2;
+  const unsigned int fe_degree     = params.fe_degree;
+  const unsigned int n_q_points    = (bp <= BPType::BP4) ? (fe_degree + 2) : (fe_degree + 1);
+  const unsigned int n_refinements = params.n_global_refinements;
+  const unsigned int n_components =
+    (bp == BPType::BP1 || bp == BPType::BP3 || bp == BPType::BP5) ? 1 : dim;
+
+  // create mapping, quadrature, fe, mesh, ...
+  MappingQ1<dim> mapping;
+  QGauss<dim>    quadrature(n_q_points);
+  FESystem<dim>  fe(FE_Q<dim>(fe_degree), n_components);
+
+#ifdef DEAL_II_WITH_P4EST
+  parallel::distributed::Triangulation<dim> tria(MPI_COMM_WORLD);
+#else
+  Triangulation<dim> tria;
+#endif
+
+  GridGenerator::hyper_cube(tria);
+  tria.refine_global(n_refinements);
+
+  DoFHandler<dim> dof_handler(tria);
+  dof_handler.distribute_dofs(fe);
+
+  DoFRenumbering::support_point_wise(dof_handler);
+
+  AffineConstraints<Number> constraints;
+
+  if (!(bp == BPType::BP1 || bp == BPType::BP2))
+    {
+      // for stiffness matrix
+      DoFTools::make_zero_boundary_constraints(dof_handler, constraints);
+      constraints.close();
+    }
+
+  const auto test = [&](const std::string &label, const auto &op) {
+    (void)label;
+
+    // initialize vector
+    VectorType u, v;
+    op.initialize_dof_vector(u);
+    op.initialize_dof_vector(v);
+    u = 1.0;
+
+    constraints.set_zero(u);
+
+    // perform matrix-vector product
+    op.vmult(v, u);
+
+    // create solver
+    ReductionControl reduction_control(100, 1e-20, 1e-6);
+
+    // create preconditioner
+    DiagonalMatrix<VectorType> diagonal_matrix;
+    op.compute_inverse_diagonal(diagonal_matrix.get_vector());
+
+    std::chrono::time_point<std::chrono::system_clock> now;
+
+    bool not_converged = false;
+
+    try
+      {
+        // solve problem
+        SolverCG<VectorType> solver(reduction_control);
+        now = std::chrono::system_clock::now();
+        solver.solve(op, v, u, diagonal_matrix);
+      }
+    catch (const SolverControl::NoConvergence &)
+      {
+        pout << "Error: solver failed to converge with" << std::endl;
+        not_converged = true;
+      }
+
+
+    const auto time =
+      std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - now)
+        .count() /
+      1e9;
+
+
+    if (params.print_timings || not_converged)
+      {
+        pout << label << ": " << reduction_control.last_step() << " " << v.l2_norm() << " "
+             << (params.print_timings ? time : 0.0) << std::endl;
+      }
+  };
+
+  // create and test the libCEED operator
+  OperatorCeed<dim, Number, MemorySpace::Default> op_ceed(
+    mapping, dof_handler, constraints, quadrature, bp, params.libCEED_resource);
+  test("ceed", op_ceed);
+
+  // create and test a native deal.II operator
+  OperatorDealii<dim, Number> op_dealii(mapping, dof_handler, constraints, quadrature, bp);
+  test("dealii", op_dealii);
+}
diff --git a/examples/deal.II/bps-kokkos.h b/examples/deal.II/bps-kokkos.h
new file mode 100644
index 0000000000..bd8ba4f54f
--- /dev/null
+++ b/examples/deal.II/bps-kokkos.h
@@ -0,0 +1,327 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2023 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+//  Authors: Peter Munch, Martin Kronbichler
+//
+// ---------------------------------------------------------------------
+
+#pragma once
+#ifndef bps_kokkos_h
+#  define bps_kokkos_h
+
+// deal.II includes
+#  include <deal.II/dofs/dof_tools.h>
+
+#  include <deal.II/fe/mapping.h>
+
+#  include <deal.II/lac/la_parallel_vector.h>
+
+#  include <deal.II/matrix_free/fe_evaluation.h>
+#  include <deal.II/matrix_free/matrix_free.h>
+#  include <deal.II/matrix_free/shape_info.h>
+#  include <deal.II/matrix_free/tools.h>
+
+// local includes
+#  include "bps.h"
+
+using namespace dealii;
+
+
+
+template <int dim, int fe_degree, int n_q_points_1d, int n_components, typename Number>
+class OperatorDealiiMassQuad
+{
+public:
+  DEAL_II_HOST_DEVICE void
+  operator()(Portable::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> *fe_eval,
+             const int q_point) const
+  {
+    fe_eval->submit_value(fe_eval->get_value(q_point), q_point);
+  }
+};
+
+
+
+template <int dim, int fe_degree, int n_q_points_1d, int n_components, typename Number>
+class OperatorDealiiLaplaceQuad
+{
+public:
+  DEAL_II_HOST_DEVICE void
+  operator()(Portable::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> *fe_eval,
+             const int q_point) const
+  {
+    fe_eval->submit_gradient(fe_eval->get_gradient(q_point), q_point);
+  }
+};
+
+
+
+template <int dim, int fe_degree, int n_q_points_1d, int n_components, typename Number>
+class OperatorDealiiMassLocal
+{
+public:
+  DEAL_II_HOST_DEVICE void
+  operator()(const typename Portable::MatrixFree<dim, Number>::Data *data,
+             const Portable::DeviceVector<Number>                   &src,
+             Portable::DeviceVector<Number>                         &dst) const
+  {
+    Portable::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> fe_eval(data);
+    fe_eval.read_dof_values(src);
+    fe_eval.evaluate(EvaluationFlags::values);
+    fe_eval.apply_for_each_quad_point(
+      OperatorDealiiMassQuad<dim, fe_degree, n_q_points_1d, n_components, Number>());
+    fe_eval.integrate(EvaluationFlags::values);
+    fe_eval.distribute_local_to_global(dst);
+  }
+
+  static const unsigned int n_local_dofs = Utilities::pow(fe_degree + 1, dim) * n_components;
+  static const unsigned int n_q_points   = Utilities::pow(n_q_points_1d, dim);
+};
+
+
+
+template <int dim, int fe_degree, int n_q_points_1d, int n_components, typename Number>
+class OperatorDealiiLaplaceLocal
+{
+public:
+  DEAL_II_HOST_DEVICE void
+  operator()(const typename Portable::MatrixFree<dim, Number>::Data *data,
+             const Portable::DeviceVector<Number>                   &src,
+             Portable::DeviceVector<Number>                         &dst) const
+  {
+    Portable::FEEvaluation<dim, fe_degree, n_q_points_1d, n_components, Number> fe_eval(data);
+    fe_eval.read_dof_values(src);
+    fe_eval.evaluate(EvaluationFlags::gradients);
+    fe_eval.apply_for_each_quad_point(
+      OperatorDealiiLaplaceQuad<dim, fe_degree, n_q_points_1d, n_components, Number>());
+    fe_eval.integrate(EvaluationFlags::gradients);
+    fe_eval.distribute_local_to_global(dst);
+  }
+
+  static const unsigned int n_local_dofs = Utilities::pow(fe_degree + 1, dim) * n_components;
+  static const unsigned int n_q_points   = Utilities::pow(n_q_points_1d, dim);
+};
+
+
+
+/**
+ * Operator GPU implementation using deal.II.
+ */
+template <int dim, typename Number>
+class OperatorDealii : public OperatorBase<Number, MemorySpace::Default>
+{
+public:
+  using VectorType = typename OperatorBase<Number, MemorySpace::Default>::VectorType;
+
+  /**
+   * Constructor.
+   */
+  OperatorDealii(const Mapping<dim>              &mapping,
+                 const DoFHandler<dim>           &dof_handler,
+                 const AffineConstraints<Number> &constraints,
+                 const Quadrature<dim>           &quadrature,
+                 const BPType                    &bp)
+    : mapping(mapping)
+    , dof_handler(dof_handler)
+    , constraints(constraints)
+    , quadrature(quadrature)
+    , bp(bp)
+  {
+    reinit();
+  }
+
+  /**
+   * Destructor.
+   */
+  ~OperatorDealii() = default;
+
+  /**
+   * Initialized internal data structures, particularly, MatrixFree.
+   */
+  void
+  reinit() override
+  {
+    // configure MatrixFree
+    typename Portable::MatrixFree<dim, Number>::AdditionalData additional_data;
+
+    if (bp <= BPType::BP2) // mass matrix
+      additional_data.mapping_update_flags = update_JxW_values | update_values;
+    else
+      additional_data.mapping_update_flags = update_JxW_values | update_gradients;
+
+    // create MatrixFree
+    AssertThrow(quadrature.is_tensor_product(), ExcNotImplemented());
+    matrix_free.reinit(
+      mapping, dof_handler, constraints, quadrature.get_tensor_basis()[0], additional_data);
+  }
+
+  /**
+   * Matrix-vector product.
+   */
+  void
+  vmult(VectorType &dst, const VectorType &src) const override
+  {
+    dst = 0.0;
+
+    const unsigned int n_components  = dof_handler.get_fe().n_components();
+    const unsigned int fe_degree     = dof_handler.get_fe().tensor_degree();
+    const unsigned int n_q_points_1d = quadrature.get_tensor_basis()[0].size();
+
+    if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 2)
+      this->vmult_internal<1, 1, 2>(dst, src);
+    else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 3)
+      this->vmult_internal<1, 2, 3>(dst, src);
+    else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 2)
+      this->vmult_internal<dim, 1, 2>(dst, src);
+    else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 3)
+      this->vmult_internal<dim, 2, 3>(dst, src);
+    else if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 3)
+      this->vmult_internal<1, 1, 3>(dst, src);
+    else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 4)
+      this->vmult_internal<1, 2, 4>(dst, src);
+    else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 3)
+      this->vmult_internal<dim, 1, 3>(dst, src);
+    else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 4)
+      this->vmult_internal<dim, 2, 4>(dst, src);
+    else
+      AssertThrow(false, ExcInternalError());
+
+    matrix_free.copy_constrained_values(src, dst);
+  }
+
+  /**
+   * Initialize vector.
+   */
+  void
+  initialize_dof_vector(VectorType &vec) const override
+  {
+    matrix_free.initialize_dof_vector(vec);
+  }
+
+  /**
+   * Compute inverse of diagonal.
+   */
+  void
+  compute_inverse_diagonal(VectorType &diagonal) const override
+  {
+    this->initialize_dof_vector(diagonal);
+
+    const unsigned int n_components  = dof_handler.get_fe().n_components();
+    const unsigned int fe_degree     = dof_handler.get_fe().tensor_degree();
+    const unsigned int n_q_points_1d = quadrature.get_tensor_basis()[0].size();
+
+    if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 2)
+      this->compute_inverse_diagonal_internal<1, 1, 2>(diagonal);
+    else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 3)
+      this->compute_inverse_diagonal_internal<1, 2, 3>(diagonal);
+    else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 2)
+      this->compute_inverse_diagonal_internal<dim, 1, 2>(diagonal);
+    else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 3)
+      this->compute_inverse_diagonal_internal<dim, 2, 3>(diagonal);
+    else if (n_components == 1 && fe_degree == 1 && n_q_points_1d == 3)
+      this->compute_inverse_diagonal_internal<1, 1, 3>(diagonal);
+    else if (n_components == 1 && fe_degree == 2 && n_q_points_1d == 4)
+      this->compute_inverse_diagonal_internal<1, 2, 4>(diagonal);
+    else if (n_components == dim && fe_degree == 1 && n_q_points_1d == 3)
+      this->compute_inverse_diagonal_internal<dim, 1, 3>(diagonal);
+    else if (n_components == dim && fe_degree == 2 && n_q_points_1d == 4)
+      this->compute_inverse_diagonal_internal<dim, 2, 4>(diagonal);
+    else
+      AssertThrow(false, ExcInternalError());
+  }
+
+private:
+  /**
+   * Templated vmult function.
+   */
+  template <int n_components, int fe_degree, int n_q_points_1d>
+  void
+  vmult_internal(VectorType &dst, const VectorType &src) const
+  {
+    if (bp <= BPType::BP2) // mass matrix
+      {
+        OperatorDealiiMassLocal<dim, fe_degree, n_q_points_1d, n_components, Number> mass_operator;
+        matrix_free.cell_loop(mass_operator, src, dst);
+      }
+    else
+      {
+        OperatorDealiiLaplaceLocal<dim, fe_degree, n_q_points_1d, n_components, Number>
+          local_operator;
+        matrix_free.cell_loop(local_operator, src, dst);
+      }
+  }
+
+  /**
+   * Templated compute_inverse_diagonal function.
+   */
+  template <int n_components, int fe_degree, int n_q_points_1d>
+  void
+  compute_inverse_diagonal_internal(VectorType &diagonal) const
+  {
+    if (bp <= BPType::BP2) // mass matrix
+      {
+        OperatorDealiiMassQuad<dim, fe_degree, n_q_points_1d, n_components, Number> op_quad;
+
+        MatrixFreeTools::compute_diagonal<dim, fe_degree, n_q_points_1d, n_components, Number>(
+          matrix_free, diagonal, op_quad, EvaluationFlags::values, EvaluationFlags::values);
+      }
+    else
+      {
+        OperatorDealiiLaplaceQuad<dim, fe_degree, n_q_points_1d, n_components, Number> op_quad;
+
+        MatrixFreeTools::compute_diagonal<dim, fe_degree, n_q_points_1d, n_components, Number>(
+          matrix_free, diagonal, op_quad, EvaluationFlags::gradients, EvaluationFlags::gradients);
+      }
+
+
+    Number *diagonal_ptr = diagonal.get_values();
+
+    Kokkos::parallel_for(
+      "lethe::invert_vector",
+      Kokkos::RangePolicy<MemorySpace::Default::kokkos_space::execution_space>(
+        0, diagonal.locally_owned_size()),
+      KOKKOS_LAMBDA(int i) { diagonal_ptr[i] = 1.0 / diagonal_ptr[i]; });
+  }
+
+  /**
+   * Mapping object passed to the constructor.
+   */
+  const Mapping<dim> &mapping;
+
+  /**
+   * DoFHandler object passed to the constructor.
+   */
+  const DoFHandler<dim> &dof_handler;
+
+  /**
+   * Constraints object passed to the constructor.
+   */
+  const AffineConstraints<Number> &constraints;
+
+  /**
+   * Quadrature rule object passed to the constructor.
+   */
+  const Quadrature<dim> &quadrature;
+
+  /**
+   * Selected BP.
+   */
+  const BPType bp;
+
+  /**
+   * MatrixFree object.
+   */
+  Portable::MatrixFree<dim, Number> matrix_free;
+};
+
+#endif
diff --git a/examples/deal.II/bps-qfunctions.h b/examples/deal.II/bps-qfunctions.h
index 6161fdf840..b6a0c498c7 100644
--- a/examples/deal.II/bps-qfunctions.h
+++ b/examples/deal.II/bps-qfunctions.h
@@ -15,7 +15,7 @@
 //
 // ---------------------------------------------------------------------
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 
 
diff --git a/examples/deal.II/bps.h b/examples/deal.II/bps.h
index 677ed1a81f..b7d28919bc 100644
--- a/examples/deal.II/bps.h
+++ b/examples/deal.II/bps.h
@@ -15,24 +15,25 @@
 //
 // ---------------------------------------------------------------------
 
+#pragma once
+#ifndef bps_h
+#  define bps_h
+
 // deal.II includes
-#include <deal.II/dofs/dof_tools.h>
+#  include <deal.II/dofs/dof_tools.h>
 
-#include <deal.II/fe/mapping.h>
+#  include <deal.II/fe/mapping.h>
 
-#include <deal.II/lac/la_parallel_vector.h>
+#  include <deal.II/lac/la_parallel_vector.h>
 
-#include <deal.II/matrix_free/fe_evaluation.h>
-#include <deal.II/matrix_free/matrix_free.h>
-#include <deal.II/matrix_free/tools.h>
+#  include <deal.II/matrix_free/fe_evaluation.h>
+#  include <deal.II/matrix_free/matrix_free.h>
+#  include <deal.II/matrix_free/shape_info.h>
+#  include <deal.II/matrix_free/tools.h>
 
-// libCEED includes
-#include <ceed/ceed.h>
+using namespace dealii;
 
-// QFunction source
-#include "bps-qfunctions.h"
 
-using namespace dealii;
 
 /**
  * BP types. For more details, see https://ceed.exascaleproject.org/bps/.
@@ -92,14 +93,14 @@ struct BPInfo
 /**
  * Base class of operators.
  */
-template <typename Number>
+template <typename Number, typename MemorySpace>
 class OperatorBase
 {
 public:
   /**
    * deal.II vector type
    */
-  using VectorType = LinearAlgebra::distributed::Vector<Number>;
+  using VectorType = LinearAlgebra::distributed::Vector<Number, MemorySpace>;
 
   /**
    * Initialize vector.
@@ -126,766 +127,4 @@ class OperatorBase
   compute_inverse_diagonal(VectorType &diagonal) const = 0;
 };
 
-
-/**
- * Operator implementation using libCEED.
- */
-template <int dim, typename Number>
-class OperatorCeed : public OperatorBase<Number>
-{
-public:
-  using VectorType = typename OperatorBase<Number>::VectorType;
-
-  /**
-   * Constructor.
-   */
-  OperatorCeed(const Mapping<dim>              &mapping,
-               const DoFHandler<dim>           &dof_handler,
-               const AffineConstraints<Number> &constraints,
-               const Quadrature<dim>           &quadrature,
-               const BPType                    &bp,
-               const std::string               &resource)
-    : mapping(mapping)
-    , dof_handler(dof_handler)
-    , constraints(constraints)
-    , quadrature(quadrature)
-    , bp(bp)
-    , resource(resource)
-  {
-    reinit();
-  }
-
-  /**
-   * Destructor.
-   */
-  ~OperatorCeed()
-  {
-    CeedOperatorDestroy(&op_apply);
-    CeedQFunctionDestroy(&qf_apply);
-    CeedQFunctionContextDestroy(&build_ctx);
-    CeedVectorDestroy(&q_data);
-    CeedElemRestrictionDestroy(&q_data_restriction);
-    CeedElemRestrictionDestroy(&sol_restriction);
-    CeedBasisDestroy(&sol_basis);
-    CeedDestroy(&ceed);
-  }
-
-  /**
-   * Initialized internal data structures, particularly, libCEED.
-   */
-  void
-  reinit() override
-  {
-    const auto &tria = dof_handler.get_triangulation();
-    const auto &fe   = dof_handler.get_fe();
-
-    const auto n_components = fe.n_components();
-
-    if (bp == BPType::BP1 || bp == BPType::BP3 || bp == BPType::BP5)
-      {
-        AssertThrow(n_components == 1, ExcInternalError());
-      }
-    else
-      {
-        AssertThrow(n_components == dim, ExcInternalError());
-      }
-
-    // 1) create CEED instance -> "MatrixFree"
-    const char *ceed_spec = resource.c_str();
-    CeedInit(ceed_spec, &ceed);
-
-    // 2) create shape functions -> "ShapeInfo"
-    const unsigned int fe_degree  = fe.tensor_degree();
-    const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
-    CeedBasisCreateTensorH1Lagrange(
-      ceed, dim, n_components, fe_degree + 1, n_q_points, CEED_GAUSS, &sol_basis);
-
-    // 3) create restriction matrix -> DoFInfo
-    unsigned int n_local_active_cells = 0;
-
-    for (const auto &cell : dof_handler.active_cell_iterators())
-      if (cell->is_locally_owned())
-        n_local_active_cells++;
-
-    partitioner =
-      std::make_shared<Utilities::MPI::Partitioner>(dof_handler.locally_owned_dofs(),
-                                                    DoFTools::extract_locally_active_dofs(
-                                                      dof_handler),
-                                                    dof_handler.get_communicator());
-
-    std::vector<CeedInt> indices;
-    indices.reserve(n_local_active_cells * fe.n_dofs_per_cell() / n_components);
-
-    const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering<dim>(fe_degree);
-
-    std::vector<types::global_dof_index> local_indices(fe.n_dofs_per_cell());
-
-    for (const auto &cell : dof_handler.active_cell_iterators())
-      if (cell->is_locally_owned())
-        {
-          cell->get_dof_indices(local_indices);
-
-          for (const auto i : dof_mapping)
-            indices.emplace_back(
-              partitioner->global_to_local(local_indices[fe.component_to_system_index(0, i)]) /
-              n_components);
-        }
-
-    CeedElemRestrictionCreate(ceed,
-                              n_local_active_cells,
-                              fe.n_dofs_per_cell() / n_components,
-                              n_components,
-                              std::max<unsigned int>(this->extended_local_size() / n_components, 1),
-                              this->extended_local_size(),
-                              CEED_MEM_HOST,
-                              CEED_COPY_VALUES,
-                              indices.data(),
-                              &sol_restriction);
-
-    // 4) create mapping -> MappingInfo
-    const unsigned int n_components_metric = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
-
-    this->weights = compute_metric_data(ceed, mapping, tria, quadrature, bp);
-
-    strides = {{1,
-                static_cast<int>(quadrature.size()),
-                static_cast<int>(quadrature.size() * n_components_metric)}};
-    CeedVectorCreate(ceed, weights.size(), &q_data);
-    CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_USE_POINTER, weights.data());
-    CeedElemRestrictionCreateStrided(ceed,
-                                     n_local_active_cells,
-                                     quadrature.size(),
-                                     n_components_metric,
-                                     weights.size(),
-                                     strides.data(),
-                                     &q_data_restriction);
-
-    build_ctx_data.dim       = dim;
-    build_ctx_data.space_dim = dim;
-
-    CeedQFunctionContextCreate(ceed, &build_ctx);
-    CeedQFunctionContextSetData(
-      build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
-
-    // 5) create q operation
-    if (bp == BPType::BP1)
-      CeedQFunctionCreateInterior(ceed, 1, f_apply_mass, f_apply_mass_loc, &qf_apply);
-    else if (bp == BPType::BP2)
-      CeedQFunctionCreateInterior(ceed, 1, f_apply_mass_vec, f_apply_mass_vec_loc, &qf_apply);
-    else if (bp == BPType::BP3 || bp == BPType::BP5)
-      CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson, f_apply_poisson_loc, &qf_apply);
-    else if (bp == BPType::BP4 || bp == BPType::BP6)
-      CeedQFunctionCreateInterior(ceed, 1, f_apply_poisson_vec, f_apply_poisson_vec_loc, &qf_apply);
-    else
-      AssertThrow(false, ExcInternalError());
-
-    if (bp <= BPType::BP2)
-      CeedQFunctionAddInput(qf_apply, "u", n_components, CEED_EVAL_INTERP);
-    else
-      CeedQFunctionAddInput(qf_apply, "u", dim * n_components, CEED_EVAL_GRAD);
-
-    CeedQFunctionAddInput(qf_apply, "qdata", n_components_metric, CEED_EVAL_NONE);
-
-    if (bp <= BPType::BP2)
-      CeedQFunctionAddOutput(qf_apply, "v", n_components, CEED_EVAL_INTERP);
-    else
-      CeedQFunctionAddOutput(qf_apply, "v", dim * n_components, CEED_EVAL_GRAD);
-
-    CeedQFunctionSetContext(qf_apply, build_ctx);
-
-    // 6) put everything together
-    CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
-
-    CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
-    CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
-    CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
-  }
-
-  /**
-   * Perform matrix-vector product.
-   */
-  void
-  vmult(VectorType &dst, const VectorType &src) const override
-  {
-    // communicate: update ghost values
-    src.update_ghost_values();
-
-    if (dof_handler.get_fe().n_components() == 1)
-      {
-        // create libCEED view on deal.II vectors
-        VectorTypeCeed src_ceed(ceed, src);
-        VectorTypeCeed dst_ceed(ceed, dst);
-
-        // apply operator
-        CeedOperatorApply(op_apply, src_ceed(), dst_ceed(), CEED_REQUEST_IMMEDIATE);
-      }
-    else // TODO: needed for multiple components
-      {
-        // allocate space for block vectors
-        src_tmp.reinit(this->extended_local_size(), true);
-        dst_tmp.reinit(this->extended_local_size(), true);
-
-        copy_to_block_vector(src_tmp, src); // copy to block vector
-
-        // create libCEED view on deal.II vectors
-        VectorTypeCeed src_ceed(ceed, src_tmp);
-        VectorTypeCeed dst_ceed(ceed, dst_tmp);
-
-        // apply operator
-        CeedOperatorApply(op_apply, src_ceed(), dst_ceed(), CEED_REQUEST_IMMEDIATE);
-
-        dst_ceed.sync_to_host();              // pull libCEED data back to host
-        copy_from_block_vector(dst, dst_tmp); // copy from block vector
-      }
-
-    // communicate: compress
-    src.zero_out_ghost_values();
-    dst.compress(VectorOperation::add);
-
-    // apply constraints: we assume homogeneous DBC
-    constraints.set_zero(dst);
-  }
-
-  /**
-   * Initialized vector.
-   */
-  void
-  initialize_dof_vector(VectorType &vec) const override
-  {
-    vec.reinit(partitioner);
-  }
-
-  /**
-   * Compute inverse of diagonal.
-   */
-  void
-  compute_inverse_diagonal(VectorType &diagonal) const override
-  {
-    this->initialize_dof_vector(diagonal);
-
-    VectorTypeCeed diagonal_ceed(ceed, diagonal);
-
-    CeedOperatorLinearAssembleDiagonal(op_apply, diagonal_ceed(), CEED_REQUEST_IMMEDIATE);
-
-    const unsigned int n_components = dof_handler.get_fe().n_components();
-
-    if (n_components > 1) // TODO: needed for multiple components
-      {
-        VectorType tmp(diagonal);
-
-        copy_from_block_vector(tmp, diagonal);
-
-        std::swap(tmp, diagonal);
-      }
-
-    diagonal.compress(VectorOperation::add);
-
-    for (auto &i : diagonal)
-      i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0;
-  }
-
-private:
-  /**
-   * Wrapper around a deal.II vector to create a libCEED vector view.
-   */
-  class VectorTypeCeed
-  {
-  public:
-    /**
-     * Constructor.
-     */
-    VectorTypeCeed(const Ceed &ceed, const VectorType &vec)
-    {
-      const unsigned int n_dofs =
-        vec.get_partitioner()->locally_owned_size() + vec.get_partitioner()->n_ghost_indices();
-
-      CeedVectorCreate(ceed, n_dofs, &vec_ceed);
-      CeedVectorSetArray(vec_ceed, CEED_MEM_HOST, CEED_USE_POINTER, vec.get_values());
-    }
-
-    /**
-     * Return libCEED vector view.
-     */
-    CeedVector &
-    operator()()
-    {
-      return vec_ceed;
-    }
-
-    /**
-     * Sync memory from device to host.
-     */
-    void
-    sync_to_host()
-    {
-      CeedVectorSyncArray(vec_ceed, CEED_MEM_HOST);
-    }
-
-    /**
-     * Destructor: destroy vector view.
-     */
-    ~VectorTypeCeed()
-    {
-      CeedScalar *ptr;
-      CeedVectorTakeArray(vec_ceed, CEED_MEM_HOST, &ptr);
-      CeedVectorDestroy(&vec_ceed);
-    }
-
-  private:
-    /**
-     * libCEED vector view.
-     */
-    CeedVector vec_ceed;
-  };
-
-  /**
-   * Copy from block vector.
-   *
-   * @note Only needed for multiple components.
-   */
-  void
-  copy_from_block_vector(VectorType &dst, const VectorType &src) const
-  {
-    const unsigned int scalar_size = this->extended_local_size() / dim;
-
-    for (unsigned int i = 0; i < scalar_size; ++i)
-      for (unsigned int j = 0; j < dim; ++j)
-        dst.get_values()[j + i * dim] = src.get_values()[j * scalar_size + i];
-  }
-
-  /**
-   * Copy to block vector.
-   *
-   * @note Only needed for multiple components.
-   */
-  void
-  copy_to_block_vector(VectorType &dst, const VectorType &src) const
-  {
-    const unsigned int scalar_size = this->extended_local_size() / dim;
-
-    for (unsigned int i = 0; i < scalar_size; ++i)
-      for (unsigned int j = 0; j < dim; ++j)
-        dst.get_values()[j * scalar_size + i] = src.get_values()[j + i * dim];
-  }
-
-  /**
-   * Number of locally active DoFs.
-   */
-  unsigned int
-  extended_local_size() const
-  {
-    return partitioner->locally_owned_size() + partitioner->n_ghost_indices();
-  }
-
-  /**
-   * Compute metric data: Jacobian, ...
-   */
-  static std::vector<double>
-  compute_metric_data(const Ceed               &ceed,
-                      const Mapping<dim>       &mapping,
-                      const Triangulation<dim> &tria,
-                      const Quadrature<dim>    &quadrature,
-                      const BPType              bp)
-  {
-    std::vector<double> weights;
-
-    if (false)
-      {
-        FE_Nothing<dim> dummy_fe;
-        FEValues<dim>   fe_values(mapping, dummy_fe, quadrature, update_JxW_values);
-
-        for (const auto &cell : tria.active_cell_iterators())
-          if (cell->is_locally_owned())
-            {
-              fe_values.reinit(cell);
-
-              for (const auto q : fe_values.quadrature_point_indices())
-                weights.emplace_back(fe_values.JxW(q));
-            }
-
-        return weights;
-      }
-
-    CeedBasis            geo_basis;
-    CeedVector           q_data;
-    CeedElemRestriction  q_data_restriction;
-    CeedVector           node_coords;
-    CeedElemRestriction  geo_restriction;
-    CeedQFunctionContext build_ctx;
-    CeedQFunction        qf_build;
-    CeedOperator         op_build;
-
-    const unsigned int n_q_points = quadrature.get_tensor_basis()[0].size();
-
-    const unsigned int n_components = (bp <= BPType::BP2) ? 1 : (dim * (dim + 1) / 2);
-
-    const auto mapping_q = dynamic_cast<const MappingQ<dim> *>(&mapping);
-
-    AssertThrow(mapping_q, ExcMessage("Wrong mapping!"));
-
-    const unsigned int fe_degree = mapping_q->get_degree();
-
-    CeedBasisCreateTensorH1Lagrange(
-      ceed, dim, dim, fe_degree + 1, n_q_points, CEED_GAUSS, &geo_basis);
-
-    unsigned int n_local_active_cells = 0;
-
-    for (const auto &cell : tria.active_cell_iterators())
-      if (cell->is_locally_owned())
-        n_local_active_cells++;
-
-    std::vector<double>  geo_support_points;
-    std::vector<CeedInt> geo_indices;
-
-    FE_Q<dim> geo_fe(fe_degree);
-
-    DoFHandler<dim> geo_dof_handler(tria);
-    geo_dof_handler.distribute_dofs(geo_fe);
-
-    const auto geo_partitioner =
-      std::make_shared<Utilities::MPI::Partitioner>(geo_dof_handler.locally_owned_dofs(),
-                                                    DoFTools::extract_locally_active_dofs(
-                                                      geo_dof_handler),
-                                                    geo_dof_handler.get_communicator());
-
-    geo_indices.reserve(n_local_active_cells * geo_fe.n_dofs_per_cell());
-
-    const auto dof_mapping = FETools::lexicographic_to_hierarchic_numbering<dim>(fe_degree);
-
-    FEValues<dim> fe_values(mapping,
-                            geo_fe,
-                            geo_fe.get_unit_support_points(),
-                            update_quadrature_points);
-
-    std::vector<types::global_dof_index> local_indices(geo_fe.n_dofs_per_cell());
-
-    const unsigned int n_points =
-      geo_partitioner->locally_owned_size() + geo_partitioner->n_ghost_indices();
-
-    geo_support_points.resize(dim * n_points);
-
-    for (const auto &cell : geo_dof_handler.active_cell_iterators())
-      if (cell->is_locally_owned())
-        {
-          fe_values.reinit(cell);
-          cell->get_dof_indices(local_indices);
-
-          for (const auto i : dof_mapping)
-            {
-              const auto index = geo_partitioner->global_to_local(local_indices[i]);
-              geo_indices.emplace_back(index);
-
-              const auto point = fe_values.quadrature_point(i);
-
-              for (unsigned int d = 0; d < dim; ++d)
-                geo_support_points[index + d * n_points] = point[d];
-            }
-        }
-
-    weights.resize(n_local_active_cells * quadrature.size() * n_components);
-
-    CeedInt strides[3] = {1,
-                          static_cast<int>(quadrature.size()),
-                          static_cast<int>(quadrature.size() * n_components)};
-
-    CeedVectorCreate(ceed, weights.size(), &q_data);
-    CeedVectorSetArray(q_data, CEED_MEM_HOST, CEED_USE_POINTER, weights.data());
-    CeedElemRestrictionCreateStrided(ceed,
-                                     n_local_active_cells,
-                                     quadrature.size(),
-                                     n_components,
-                                     weights.size(),
-                                     strides,
-                                     &q_data_restriction);
-
-    CeedVectorCreate(ceed, geo_support_points.size(), &node_coords);
-    CeedVectorSetArray(node_coords, CEED_MEM_HOST, CEED_USE_POINTER, geo_support_points.data());
-
-    CeedElemRestrictionCreate(ceed,
-                              n_local_active_cells,
-                              geo_fe.n_dofs_per_cell(),
-                              dim,
-                              std::max<unsigned int>(geo_support_points.size() / dim, 1),
-                              geo_support_points.size(),
-                              CEED_MEM_HOST,
-                              CEED_COPY_VALUES,
-                              geo_indices.data(),
-                              &geo_restriction);
-
-    BuildContext build_ctx_data;
-    build_ctx_data.dim       = dim;
-    build_ctx_data.space_dim = dim;
-
-    CeedQFunctionContextCreate(ceed, &build_ctx);
-    CeedQFunctionContextSetData(
-      build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
-
-    // 5) create q operation
-    if (bp <= BPType::BP2)
-      CeedQFunctionCreateInterior(ceed, 1, f_build_mass, f_build_mass_loc, &qf_build);
-    else
-      CeedQFunctionCreateInterior(ceed, 1, f_build_poisson, f_build_poisson_loc, &qf_build);
-
-    CeedQFunctionAddInput(qf_build, "geo", dim * dim, CEED_EVAL_GRAD);
-    CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT);
-    CeedQFunctionAddOutput(qf_build, "qdata", n_components, CEED_EVAL_NONE);
-    CeedQFunctionSetContext(qf_build, build_ctx);
-
-    // 6) put everything together
-    CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
-    CeedOperatorSetField(op_build, "geo", geo_restriction, geo_basis, CEED_VECTOR_ACTIVE);
-    CeedOperatorSetField(
-      op_build, "weights", CEED_ELEMRESTRICTION_NONE, geo_basis, CEED_VECTOR_NONE);
-    CeedOperatorSetField(
-      op_build, "qdata", q_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
-
-    CeedOperatorApply(op_build, node_coords, q_data, CEED_REQUEST_IMMEDIATE);
-
-    CeedOperatorDestroy(&op_build);
-    CeedQFunctionDestroy(&qf_build);
-    CeedQFunctionContextDestroy(&build_ctx);
-    CeedElemRestrictionDestroy(&geo_restriction);
-    CeedVectorDestroy(&node_coords);
-    CeedElemRestrictionDestroy(&q_data_restriction);
-    CeedVectorSyncArray(q_data, CEED_MEM_HOST);
-    CeedVectorDestroy(&q_data);
-    CeedBasisDestroy(&geo_basis);
-
-    return weights;
-  }
-
-  /**
-   * Mapping object passed to the constructor.
-   */
-  const Mapping<dim> &mapping;
-
-  /**
-   * DoFHandler object passed to the constructor.
-   */
-  const DoFHandler<dim> &dof_handler;
-
-  /**
-   * Constraints object passed to the constructor.
-   */
-  const AffineConstraints<Number> &constraints;
-
-  /**
-   * Quadrature rule object passed to the constructor.
-   */
-  const Quadrature<dim> &quadrature;
-
-  /**
-   * Selected BP.
-   */
-  const BPType bp;
-
-  /**
-   * Resource name.
-   */
-  const std::string resource;
-
-  /**
-   * Partitioner for distributed vectors.
-   */
-  std::shared_ptr<Utilities::MPI::Partitioner> partitioner;
-
-  /**
-   * libCEED data structures.
-   */
-  Ceed                   ceed;
-  CeedBasis              sol_basis;
-  CeedElemRestriction    sol_restriction;
-  CeedElemRestriction    q_data_restriction;
-  std::vector<double>    weights;
-  CeedVector             q_data;
-  std::array<CeedInt, 3> strides;
-  BuildContext           build_ctx_data;
-  CeedQFunctionContext   build_ctx;
-  CeedQFunction          qf_apply;
-  CeedOperator           op_apply;
-
-  /**
-   * Temporal (tempral) vectors.
-   *
-   * @note Only needed for multiple components.
-   */
-  mutable VectorType src_tmp;
-  mutable VectorType dst_tmp;
-};
-
-
-
-template <int dim, typename Number>
-class OperatorDealii : public OperatorBase<Number>
-{
-public:
-  using VectorType = typename OperatorBase<Number>::VectorType;
-
-  /**
-   * Constructor.
-   */
-  OperatorDealii(const Mapping<dim>              &mapping,
-                 const DoFHandler<dim>           &dof_handler,
-                 const AffineConstraints<Number> &constraints,
-                 const Quadrature<dim>           &quadrature,
-                 const BPType                    &bp)
-    : mapping(mapping)
-    , dof_handler(dof_handler)
-    , constraints(constraints)
-    , quadrature(quadrature)
-    , bp(bp)
-  {
-    reinit();
-  }
-
-  /**
-   * Destructor.
-   */
-  ~OperatorDealii() = default;
-
-  /**
-   * Initialized internal data structures, particularly, MatrixFree.
-   */
-  void
-  reinit() override
-  {
-    // configure MatrixFree
-    typename MatrixFree<dim, Number>::AdditionalData additional_data;
-    additional_data.tasks_parallel_scheme =
-      MatrixFree<dim, Number>::AdditionalData::TasksParallelScheme::none;
-
-    // create MatrixFree
-    matrix_free.reinit(mapping, dof_handler, constraints, quadrature, additional_data);
-  }
-
-  /**
-   * Matrix-vector product.
-   */
-  void
-  vmult(VectorType &dst, const VectorType &src) const override
-  {
-    if (dof_handler.get_fe().n_components() == 1)
-      {
-        matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range<1>, this, dst, src, true);
-      }
-    else
-      {
-        AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError());
-
-        matrix_free.cell_loop(&OperatorDealii::do_cell_integral_range<dim>, this, dst, src, true);
-      }
-  }
-
-  /**
-   * Initialize vector.
-   */
-  void
-  initialize_dof_vector(VectorType &vec) const override
-  {
-    matrix_free.initialize_dof_vector(vec);
-  }
-
-  /**
-   * Compute inverse of diagonal.
-   */
-  void
-  compute_inverse_diagonal(VectorType &diagonal) const override
-  {
-    this->initialize_dof_vector(diagonal);
-
-    if (dof_handler.get_fe().n_components() == 1)
-      {
-        MatrixFreeTools::compute_diagonal(matrix_free,
-                                          diagonal,
-                                          &OperatorDealii::do_cell_integral_local<1>,
-                                          this);
-      }
-    else
-      {
-        AssertThrow(dof_handler.get_fe().n_components() == dim, ExcInternalError());
-
-        MatrixFreeTools::compute_diagonal(matrix_free,
-                                          diagonal,
-                                          &OperatorDealii::do_cell_integral_local<dim>,
-                                          this);
-      }
-
-    for (auto &i : diagonal)
-      i = (std::abs(i) > 1.0e-10) ? (1.0 / i) : 1.0;
-  }
-
-private:
-  /**
-   * Cell integral without vector access.
-   */
-  template <int n_components>
-  void
-  do_cell_integral_local(FEEvaluation<dim, -1, 0, n_components, Number> &phi) const
-  {
-    if (bp <= BPType::BP2) // mass matrix
-      {
-        phi.evaluate(EvaluationFlags::values);
-        for (const auto q : phi.quadrature_point_indices())
-          phi.submit_value(phi.get_value(q), q);
-        phi.integrate(EvaluationFlags::values);
-      }
-    else // Poisson operator
-      {
-        phi.evaluate(EvaluationFlags::gradients);
-        for (const auto q : phi.quadrature_point_indices())
-          phi.submit_gradient(phi.get_gradient(q), q);
-        phi.integrate(EvaluationFlags::gradients);
-      }
-  }
-
-  /**
-   * Cell integral on a range of cells.
-   */
-  template <int n_components>
-  void
-  do_cell_integral_range(const MatrixFree<dim, Number>               &matrix_free,
-                         VectorType                                  &dst,
-                         const VectorType                            &src,
-                         const std::pair<unsigned int, unsigned int> &range) const
-  {
-    FEEvaluation<dim, -1, 0, n_components, Number> phi(matrix_free, range);
-
-    for (unsigned cell = range.first; cell < range.second; ++cell)
-      {
-        phi.reinit(cell);
-        phi.read_dof_values(src);            // read source vector
-        do_cell_integral_local(phi);         // cell integral
-        phi.distribute_local_to_global(dst); // write to destination vector
-      }
-  }
-
-  /**
-   * Mapping object passed to the constructor.
-   */
-  const Mapping<dim> &mapping;
-
-  /**
-   * DoFHandler object passed to the constructor.
-   */
-  const DoFHandler<dim> &dof_handler;
-
-  /**
-   * Constraints object passed to the constructor.
-   */
-  const AffineConstraints<Number> &constraints;
-
-  /**
-   * Quadrature rule object passed to the constructor.
-   */
-  const Quadrature<dim> &quadrature;
-
-  /**
-   * Selected BP.
-   */
-  const BPType bp;
-
-  /**
-   * MatrixFree object.
-   */
-  MatrixFree<dim, Number> matrix_free;
-};
+#endif
diff --git a/examples/fluids/Makefile b/examples/fluids/Makefile
index f5a1d7c8c0..c99a63a0b3 100644
--- a/examples/fluids/Makefile
+++ b/examples/fluids/Makefile
@@ -23,11 +23,8 @@ PETSc.pc := $(PETSC_DIR)/$(PETSC_ARCH)/lib/pkgconfig/PETSc.pc
 CEED_DIR ?= ../..
 ceed.pc := $(CEED_DIR)/lib/pkgconfig/ceed.pc
 
-# ASAN must be left empty if you don't want to use it
-ASAN ?=
-
 CC = $(call pkgconf, --variable=ccompiler $(PETSc.pc) $(ceed.pc))
-CFLAGS = -std=c99 \
+CFLAGS = -std=c11 \
   $(call pkgconf, --variable=cflags_extra $(PETSc.pc)) \
   $(call pkgconf, --cflags-only-other $(PETSc.pc)) \
   $(OPT) $(OPT_EXAMPLES)
@@ -37,12 +34,17 @@ LDFLAGS = $(call pkgconf, --libs-only-L --libs-only-other $(PETSc.pc) $(ceed.pc)
 LDFLAGS += $(patsubst -L%, $(call pkgconf, --variable=ldflag_rpath $(PETSc.pc))%, $(call pkgconf, --libs-only-L $(PETSc.pc) $(ceed.pc)))
 LDLIBS = $(call pkgconf, --libs-only-l $(PETSc.pc) $(ceed.pc)) -lm
 
-AFLAGS ?= -fsanitize=address #-fsanitize=undefined -fno-omit-frame-pointer
+# Address Sanitizer Setup
+# ASAN must be left empty if you don't want to use it
+ASAN ?=
+AFLAGS ?= -fsanitize=address
+# Also: -fsanitize=undefined -fno-omit-frame-pointer
 CFLAGS += $(if $(ASAN),$(AFLAGS))
 FFLAGS += $(if $(ASAN),$(AFLAGS))
 LDFLAGS += $(if $(ASAN),$(AFLAGS))
 CPPFLAGS += -I./include
 
+# Source Files
 OBJDIR := build
 SRCDIR := src
 PROBLEMDIR := problems
@@ -50,24 +52,10 @@ PROBLEMDIR := problems
 src.c := navierstokes.c $(sort $(wildcard $(PROBLEMDIR)/*.c)) $(sort $(wildcard $(SRCDIR)/*.c))
 src.o = $(src.c:%.c=$(OBJDIR)/%.o)
 
-# Path to install directory for SmartRedis. Example: /software/smartredis/install
-SMARTREDIS_DIR ?=
-ifdef SMARTREDIS_DIR
-	hiredis.pc := $(SMARTREDIS_DIR)/lib/pkgconfig/hiredis.pc
-	lsmartredis:= -lsmartredis
-	redis++.pc = $(wildcard $(SMARTREDIS_DIR)/lib/pkgconfig/redis++.pc $(SMARTREDIS_DIR)/lib64/pkgconfig/redis++.pc)
-
-	CPPFLAGS += $(call pkgconf, --cflags-only-I $(hiredis.pc) $(redis++.pc))
-	LDFLAGS += $(call pkgconf, --libs-only-L --libs-only-other $(hiredis.pc) $(redis++.pc))
-	LDFLAGS += $(patsubst -L%, $(call pkgconf, --variable=ldflag_rpath $(PETSc.pc))%, $(call pkgconf, --libs-only-L $(hiredis.pc) $(redis++.pc)))
-	LDLIBS += $(call pkgconf, --libs-only-l $(hiredis.pc) $(redis++.pc)) $(lsmartredis)
-	src.c += $(sort $(wildcard $(SRCDIR)/smartsim/*.c))
-endif
-
 all: navierstokes
 
 navierstokes: $(src.o) | $(PETSc.pc) $(ceed.pc)
-	$(call quiet,LINK.o) $(CEED_LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
+	$(call quiet,LINK.o) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
 
 .SECONDEXPANSION: # to expand $$(@D)/.DIR
 %/.DIR :
@@ -77,7 +65,7 @@ navierstokes: $(src.o) | $(PETSc.pc) $(ceed.pc)
 # Quiet, color output
 quiet ?= $($(1))
 
-$(OBJDIR)/%.o : %.c | $$(@D)/.DIR
+$(OBJDIR)/%.o : %.c  Makefile | $$(@D)/.DIR
 	$(call quiet,CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $(abspath $<)
 
 print: $(PETSc.pc) $(ceed.pc)
diff --git a/examples/fluids/README.md b/examples/fluids/README.md
index 179e3950c8..ec0ea560e1 100644
--- a/examples/fluids/README.md
+++ b/examples/fluids/README.md
@@ -1,7 +1,8 @@
 ## libCEED: Navier-Stokes Example
 
 This page provides a description of the Navier-Stokes example for the libCEED library, based on PETSc.
-PETSc v3.17 or a development version of PETSc at commit 0e95d842 or later is required.
+
+HONEE, a more fully featured fluid dynamics solver, can be found on [GitLab](https://gitlab.com/phypid/honee).
 
 The Navier-Stokes problem solves the compressible Navier-Stokes equations in three dimensions using an explicit time integration.
 The state variables are mass density, momentum density, and energy density.
@@ -20,15 +21,9 @@ and run with:
 ./navierstokes -ceed [ceed] -problem [problem type] -degree [degree]
 ```
 
-If you want to do *in situ* machine-learning training, specify `SMARTREDIS_DIR` in the make command like:
-
-```
-make SMARTREDIS_DIR=~/software/smartredis/install
-```
-
 ## Runtime options
 
-% inclusion-fluids-marker
+<!-- fluids-inclusion -->
 
 The Navier-Stokes mini-app is controlled via command-line options.
 The following options are common among all problem types:
@@ -615,7 +610,7 @@ For the Density Current, Channel, and Blasius problems, the following common com
   - boolean
 
 * - `-state_var`
-  - State variables to solve solution with. `conservative` ($\rho, \rho \bm{u}, \rho e$) or `primitive` ($P, \bm{u}, T$)
+  - State variables to solve solution with. `conservative` ($\rho, \rho \bm{u}, \rho e$), `primitive` ($P, \bm{u}, T$), or `entropy` ($\frac{\gamma - s}{\gamma - 1} - \frac{\rho}{P} (e - c_v T),\ \frac{\rho}{P} \bm{u},\ -\frac{\rho}{P}$) where  $s = \ln(P\rho^{-\gamma})$
   - `conservative`
   - string
 
@@ -634,25 +629,10 @@ For the Density Current, Channel, and Blasius problems, the following common com
   - `0`
   - `m`
 
-* - `-sgs_model_type`
-  - Type of subgrid stress model to use. Currently only `data_driven` is available
-  - `none`
-  - string
-
-* - `-sgs_model_dd_leakyrelu_alpha`
-  - Slope parameter for Leaky ReLU activation function. `0` corresponds to normal ReLU
-  - 0
-  -
-
-* - `-sgs_model_dd_parameter_dir`
-  - Path to directory with data-driven model parameters (weights, biases, etc.)
-  - `./dd_sgs_parameters`
-  - string
-
-* - `-sgs_model_dd_use_fused`
-  - Whether to use "fused" mode for data-driven model evaluation
-  - `true`
-  - boolean
+* - `-idl_pressure`
+  - Pressure used for IDL reference pressure
+  -  `-reference_pressure`
+  - `Pa`
 
 * - `-diff_filter_monitor`
   - Enable differential filter TSMonitor
@@ -688,31 +668,6 @@ For the Density Current, Channel, and Blasius problems, the following common com
   - Friction length associated with the flow, $\delta_\nu$. Used in wall-damping functions
   - 0
   - `m`
-
-* - `-sgs_train_enable`
-  - Whether to enable *in situ* training of data-driven SGS model. Require building with SmartRedis.
-  - `false`
-  - boolean
-
-* - `-sgs_train_write_data_interval`
-  - Number of timesteps between writing training data into SmartRedis database
-  - `1`
-  -
-
-* - `-sgs_train_overwrite_data`
-  - Whether new training data should overwrite old data on database
-  - `true`
-  - boolean
-
-* - `-sgs_train_filter_widths`
-  - List of scalar values for different filter widths to calculate for training data
-  -
-  - `m`
-
-* - `-smartsim_collocated_num_ranks`
-  - Number of MPI ranks associated with each collocated database (i.e. ranks per node)
-  - `1`
-  -
 :::
 
 #### Gaussian Wave
@@ -948,6 +903,11 @@ The Blasius problem has the following command-line options in addition to the Ne
   - `288`
   - `K`
 
+* - `-pressure_infinity`
+  - Atmospheric pressure, also sets IDL reference pressure
+  - `1.01E5`
+  - `Pa`
+
 * - `-temperature_wall`
   - Wall temperature
   - `288`
@@ -958,11 +918,6 @@ The Blasius problem has the following command-line options in addition to the Ne
   - `4.2e-3`
   - `m`
 
-* - `-P0`
-  - Atmospheric pressure
-  - `1.01E5`
-  - `Pa`
-
 * - `-platemesh_modify_mesh`
   - Whether to modify the mesh using the given options below.
   - `false`
@@ -1067,6 +1022,16 @@ Using the STG Inflow for the blasius problem adds the following command-line opt
   - `false`
   -
 
+* - `-stg_dx`
+  - Set the element size in the x direction. Default is calculated for box meshes, assuming equispaced elements.
+  -
+  - `m`
+
+* - `-stg_h_scale_factor`
+  - Scale element size for cutoff frequency calculation
+  - $1/p$
+  -
+
 :::
 
 This problem can be run with the `blasius.yaml` file via:
diff --git a/examples/fluids/dd_sgs_data/OutScaling.dat b/examples/fluids/dd_sgs_data/OutScaling.dat
deleted file mode 100644
index 94dab73636..0000000000
--- a/examples/fluids/dd_sgs_data/OutScaling.dat
+++ /dev/null
@@ -1,13 +0,0 @@
-12 1
-0e+00
-2e+00
-0e+00
-2e+00
-0e+00
-2e+00
--1e+00
-1e+00
--1e+00
-1e+00
--1e+00
-1e+00
diff --git a/examples/fluids/dd_sgs_data/b1.dat b/examples/fluids/dd_sgs_data/b1.dat
deleted file mode 100644
index 873f658ea9..0000000000
--- a/examples/fluids/dd_sgs_data/b1.dat
+++ /dev/null
@@ -1,21 +0,0 @@
-20 1
-4.899884770038e-01
-3.563204159517e-01
-2.627287776915e-01
-2.951473061921e-02
--4.622340771977e-01
--1.209842939357e-02
--4.663763370896e-01
-8.796932075820e-02
-4.501638907868e-01
-2.077678516370e-01
--1.139284062351e-01
--3.303352644675e-01
--4.148295154500e-01
--4.833042778786e-02
-2.972372410179e-02
--2.464389991227e-01
--2.877421872362e-01
--4.567405721457e-01
-4.734193646824e-01
--4.818997410080e-01
diff --git a/examples/fluids/dd_sgs_data/b2.dat b/examples/fluids/dd_sgs_data/b2.dat
deleted file mode 100644
index 4ff5bd0b30..0000000000
--- a/examples/fluids/dd_sgs_data/b2.dat
+++ /dev/null
@@ -1,7 +0,0 @@
-6 1
-1.176169920799e-01
--2.134958413350e-01
-1.512851885922e-01
-1.612014419874e-01
--1.437293376985e-02
-2.899547585024e-01
diff --git a/examples/fluids/dd_sgs_data/w1.dat b/examples/fluids/dd_sgs_data/w1.dat
deleted file mode 100644
index f27466d680..0000000000
--- a/examples/fluids/dd_sgs_data/w1.dat
+++ /dev/null
@@ -1,121 +0,0 @@
-120 1
--1.573046615553e-01
--8.451867037896e-02
-1.685678425651e-01
--4.017536901661e-01
-4.068168468515e-01
--1.642360540833e-01
-3.676945195442e-01
-3.470122358933e-02
-8.395344749312e-02
-1.230997497940e-01
-1.211759010593e-02
-2.570394361674e-01
-3.340400131793e-01
-2.342342193371e-01
-3.243180076338e-01
-1.235553459881e-01
--7.501312735230e-03
-1.277232278360e-01
-5.125506452634e-02
-1.844073315749e-02
-3.604786379338e-02
-2.063091161008e-01
-2.406054256905e-01
-1.846018306032e-01
-8.525111135827e-02
-3.795039661101e-01
--5.693426468413e-02
--8.111639981827e-02
-5.813760592106e-02
-1.490767475429e-01
--2.410115518494e-01
-2.173034199970e-01
-1.497734413376e-01
--1.296487298257e-01
-2.078686368723e-01
-3.891951801941e-01
-4.243457269355e-01
-1.735201583344e-02
--8.168373382023e-02
--5.933063216886e-02
-1.886585865778e-01
-1.756465348482e-01
-3.295663670792e-01
-1.056135052370e-01
--2.574613681620e-02
-3.683309291418e-01
-3.263624712033e-01
-8.396039179924e-03
--1.916324382654e-01
--2.628404302745e-01
--4.853315252243e-01
-3.133577858731e-01
--2.225070735939e-01
--9.576593410171e-02
-1.447837018193e-01
-2.479471268180e-01
--2.815934342469e-02
-4.508725076092e-02
-1.597744878041e-01
-3.494916947631e-01
--1.426111236028e-02
--1.950362350157e-01
--1.520383426062e-01
--1.344609935156e-01
--2.834500136985e-01
--1.781729998743e-01
--2.521768488857e-03
-6.124647252338e-02
-1.821655951804e-01
-1.293018729851e-01
--9.152586815194e-02
-1.765147511709e-01
-1.875253937772e-02
-2.166082722554e-01
-2.938824219314e-01
-5.320082811374e-02
-2.741659946012e-02
--2.433400466181e-02
--2.085467015769e-01
--1.562518751071e-01
-1.953718281920e-01
--1.221103203238e-02
--6.595354434769e-03
-1.189039582211e-02
-4.107899017131e-02
--6.139734862958e-02
--1.123938999802e-01
-4.565610032251e-04
--1.740175952284e-01
--1.494514855103e-01
--2.351603953684e-01
-3.606743670982e-02
-8.892241319819e-02
--3.823627099458e-02
--4.888216006000e-02
--2.063411767057e-02
-2.653079299534e-02
-2.183949112550e-01
-2.504859939801e-01
-2.814937134408e-01
-7.415384984914e-01
--1.397471716093e+00
--1.489213014481e+00
--5.251418296160e-01
-1.137088253126e-02
--1.895953497433e+00
-7.674570685028e-02
--4.854041451939e+00
-3.391193043882e+00
-2.707932115838e-01
--2.105949983636e-01
-3.070531949510e+00
--4.339743339556e+00
-1.620039300970e+00
--5.362553981240e+00
--3.777406494431e-01
-6.925621482846e-01
-2.343923900615e+00
--3.371354057465e-01
-2.055037536703e-01
diff --git a/examples/fluids/dd_sgs_data/w2.dat b/examples/fluids/dd_sgs_data/w2.dat
deleted file mode 100644
index e1553a6817..0000000000
--- a/examples/fluids/dd_sgs_data/w2.dat
+++ /dev/null
@@ -1,121 +0,0 @@
-120 1
-1.135291623557e-01
-2.505376613198e-01
--9.772966879924e-02
--3.165730972704e-02
-2.807214492556e-01
-1.905260494013e-01
--2.411145792883e-01
-7.384048966390e-02
--1.125988973598e-01
-2.226653706004e-01
--8.789637173632e-02
-2.422358783658e-02
--1.888415645076e-01
--1.810726479901e-01
--1.820814108385e-01
-2.707856893663e-02
-2.395061686285e-01
-3.132696895911e-01
--3.571137262982e-02
--6.703403319249e-02
--2.135582591703e-02
-1.706671398779e-01
--1.422555292276e-01
--1.599414011627e-01
-5.590818266867e-02
-4.760353849516e-02
--8.990354851525e-02
--2.351533551901e-01
--9.919203877195e-02
--8.730502598066e-02
--1.624083994254e-01
--1.756234871059e-01
-2.155448112826e-03
--2.196716615285e-01
-1.230359555198e-01
--2.487008789866e-01
-1.724010972168e-01
-1.200986779247e-01
--1.356200209136e-01
--7.136175504869e-02
--3.284780361916e-02
--2.809583022011e-01
--4.970314689199e-01
--2.877535188767e-02
--3.486136238658e-01
-1.031508309715e-01
--1.166679199470e-01
-1.560071145323e-01
-2.028477831976e-01
-1.679921757572e-01
-1.107170925328e-01
-3.667441712254e-02
-4.279277543497e-02
-1.742941565737e-01
--3.784073837720e-02
-1.170800846414e-01
--8.476677440525e-02
-1.497150762135e-01
-2.095513599240e-01
-1.824870885809e-01
-4.204566627279e-03
--1.556048882917e-02
-1.383926559619e-02
--3.655393508686e-02
-1.111261215177e-01
--3.069205340750e-04
-3.488581056182e-01
--8.042626832384e-02
-1.033683988755e-01
-5.948803437376e-02
--1.994940978541e-01
-7.096924570423e-03
--5.218607313871e-01
--3.428397293084e-01
--2.293382327216e-01
--1.460950001481e-01
--1.581076721431e-01
--2.289507718293e-01
-9.798627298221e-02
-1.437733340246e-01
-1.419228410529e-01
-1.958229699684e-01
-6.931951694653e-03
--7.136749568601e-02
--4.555582403662e-01
--3.070119242611e-01
--2.470410221827e-01
--7.803738726853e-02
-9.142063556119e-02
--1.368559538361e-02
--1.850283326418e-01
-1.152746119954e-02
-1.638429235964e-01
--1.435165512193e-01
--2.534513849487e-01
--2.984090266181e-01
-2.217432932036e-01
--8.358398540164e-02
-7.406614310444e-02
--5.651017266891e-02
--2.270784064420e-01
--2.302290117375e-01
-9.304265393625e-02
-6.798332878752e-02
-4.431976767864e-02
--1.707610729819e-01
--1.410204520039e-01
-1.327823810929e-01
--6.044012224887e-02
--1.376555083883e-02
--3.025252354651e-03
-1.907005235143e-01
-1.291788250753e-01
-5.697185825588e-02
-5.093944063855e-02
--5.412382470510e-02
-2.268724377069e-01
--7.159129384369e-02
--2.554784469980e-01
--1.335334767520e-01
diff --git a/examples/fluids/include/bc_definition.h b/examples/fluids/include/bc_definition.h
new file mode 100644
index 0000000000..7b5671ab1c
--- /dev/null
+++ b/examples/fluids/include/bc_definition.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <ceed.h>
+#include <petsc.h>
+
+typedef struct _p_BCDefinition *BCDefinition;
+struct _p_BCDefinition {
+  char *name;
+
+  // Boundary ID information
+  PetscInt num_label_values, *label_values, dm_field;
+
+  // Essential Boundary information
+  PetscInt num_essential_comps, *essential_comps;
+};
+
+/**
+   @brief Creates a `BCDefinition` from an array of integers in an option in the database
+
+   Must be between `PetscOptionsBegin()` and `PetscOptionsEnd()`.
+
+   @param[in]  opt    The option one is seeking
+   @param[in]  text   Short string describing option
+   @param[in]  man    Manual page for the option
+   @param[in]  name   String that sets the name of the `BCDefinition`
+   @param[out] bc_def Resulting `BCDefinition`, `NULL` if option is not set
+   @param[out] set    `PETSC_TRUE` if found, else `PETSC_FALSE`
+**/
+#define PetscOptionsBCDefinition(opt, text, man, name, bc_def, set) \
+  PetscOptionsBCDefinition_Private(PetscOptionsObject, opt, text, man, name, bc_def, set)
+PetscErrorCode PetscOptionsBCDefinition_Private(PetscOptionItems PetscOptionsObject, const char opt[], const char text[], const char man[],
+                                                const char name[], BCDefinition *bc_def, PetscBool *set);
+
+PetscErrorCode BCDefinitionCreate(const char *name, PetscInt num_label_values, PetscInt label_values[], BCDefinition *bc_def);
+PetscErrorCode BCDefinitionGetInfo(BCDefinition bc_def, const char *name[], PetscInt *num_label_values, const PetscInt *label_values[]);
+PetscErrorCode BCDefinitionDestroy(BCDefinition *bc_def);
+
+PetscErrorCode BCDefinitionSetEssential(BCDefinition bc_def, PetscInt num_essential_comps, PetscInt essential_comps[]);
+PetscErrorCode BCDefinitionGetEssential(BCDefinition bc_def, PetscInt *num_essential_comps, const PetscInt *essential_comps[]);
diff --git a/examples/fluids/include/log_events.h b/examples/fluids/include/log_events.h
new file mode 100644
index 0000000000..4a70db3b83
--- /dev/null
+++ b/examples/fluids/include/log_events.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <petsc.h>
+
+extern PetscLogEvent FLUIDS_CeedOperatorApply;
+extern PetscLogEvent FLUIDS_CeedOperatorAssemble;
+extern PetscLogEvent FLUIDS_CeedOperatorAssembleDiagonal;
+extern PetscLogEvent FLUIDS_CeedOperatorAssemblePointBlockDiagonal;
+extern PetscLogEvent FLUIDS_SmartRedis_Init;
+extern PetscLogEvent FLUIDS_SmartRedis_Meta;
+extern PetscLogEvent FLUIDS_SmartRedis_Train;
+extern PetscLogEvent FLUIDS_TrainDataCompute;
+extern PetscLogEvent FLUIDS_DifferentialFilter;
+extern PetscLogEvent FLUIDS_VelocityGradientProjection;
+
+PetscErrorCode RegisterLogEvents();
diff --git a/examples/fluids/include/mat-ceed-impl.h b/examples/fluids/include/mat-ceed-impl.h
index f5d5d9ac6a..67a77b7591 100644
--- a/examples/fluids/include/mat-ceed-impl.h
+++ b/examples/fluids/include/mat-ceed-impl.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,48 +7,20 @@
 #pragma once
 
 #include <ceed.h>
+#include <petsc-ceed.h>
 #include <petscdm.h>
 #include <petscmat.h>
-
-#if defined(__clang_analyzer__)
-#define MATCEED_EXTERN extern
-#elif defined(__cplusplus)
-#define MATCEED_EXTERN extern "C"
-#else
-#define MATCEED_EXTERN extern
-#endif
-
-#if defined(__clang_analyzer__)
-#define MATCEED_INTERN
-#else
-#define MATCEED_INTERN MATCEED_EXTERN __attribute__((visibility("hidden")))
-#endif
-
-/**
-  @brief Calls a libCEED function and then checks the resulting error code.
-  If the error code is non-zero, then a PETSc error is set with the libCEED error message.
-**/
-#ifndef PetscCallCeed
-#define PetscCallCeed(ceed_, ...)                                   \
-  do {                                                              \
-    int ierr_q_ = __VA_ARGS__;                                      \
-    if (ierr_q_ != CEED_ERROR_SUCCESS) {                            \
-      const char *error_message;                                    \
-      CeedGetErrorMessage(ceed_, &error_message);                   \
-      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "%s", error_message); \
-    }                                                               \
-  } while (0)
-#endif
+#include <petsc/private/petscimpl.h>
 
 // MatCeed context for applying composite CeedOperator on a DM
 typedef struct MatCeedContext_private *MatCeedContext;
 struct MatCeedContext_private {
   Ceed           ceed;
-  char          *name, *internal_mat_type;
+  char          *name, *coo_mat_type;
   PetscMemType   mem_type;
   PetscInt       ref_count, num_mats_assembled_full, num_mats_assembled_pbd;
   PetscBool      is_destroyed, is_ceed_pbd_valid, is_ceed_vpbd_valid;
-  PetscLogEvent  log_event_mult, log_event_mult_transpose;
+  PetscLogEvent  log_event_mult, log_event_mult_transpose, log_event_ceed_mult, log_event_ceed_mult_transpose;
   DM             dm_x, dm_y;
   Mat           *mats_assembled_full, *mats_assembled_pbd, mat_assembled_full_internal, mat_assembled_pbd_internal;
   Vec            X_loc, Y_loc_transpose;
@@ -58,17 +30,18 @@ struct MatCeedContext_private {
 };
 
 // Context data
-MATCEED_INTERN PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_transpose, CeedOperator op_mult,
-                                                   CeedOperator op_mult_transpose, PetscLogEvent log_event_mult,
-                                                   PetscLogEvent log_event_mult_transpose, MatCeedContext *ctx);
-MATCEED_INTERN PetscErrorCode MatCeedContextReference(MatCeedContext ctx);
-MATCEED_INTERN PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *ctx_copy);
-MATCEED_INTERN PetscErrorCode MatCeedContextDestroy(MatCeedContext ctx);
-
-// Mat Ceed
-MATCEED_INTERN PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D);
-MATCEED_INTERN PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y);
-MATCEED_INTERN PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_transpose, CeedOperator op_mult,
+                                                      CeedOperator op_mult_transpose, PetscLogEvent log_event_mult,
+                                                      PetscLogEvent log_event_mult_transpose, PetscLogEvent log_event_ceed_mult,
+                                                      PetscLogEvent log_event_ceed_mult_transpose, MatCeedContext *ctx);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedContextReference(MatCeedContext ctx);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *ctx_copy);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedContextDestroy(MatCeedContext *ctx);
+
+// MatCEED
+PETSC_CEED_EXTERN PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D);
+PETSC_CEED_EXTERN PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y);
+PETSC_CEED_EXTERN PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X);
 
 extern PetscClassId  MATCEED_CLASSID;
 extern PetscLogEvent MATCEED_MULT, MATCEED_MULT_TRANSPOSE;
diff --git a/examples/fluids/include/mat-ceed.h b/examples/fluids/include/mat-ceed.h
index 75a7a612dd..b6a8c08511 100644
--- a/examples/fluids/include/mat-ceed.h
+++ b/examples/fluids/include/mat-ceed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,38 +7,45 @@
 #pragma once
 
 #include <ceed.h>
+#include <petsc-ceed.h>
 #include <petscdm.h>
 #include <petscmat.h>
 
 #define MATCEED "ceed"
 
-#if defined(__clang_analyzer__)
-#define MATCEED_EXTERN extern
-#elif defined(__cplusplus)
-#define MATCEED_EXTERN extern "C"
-#else
-#define MATCEED_EXTERN extern
-#endif
-
-#if defined(__clang_analyzer__)
-#define MATCEED_INTERN
-#else
-#define MATCEED_INTERN MATCEED_EXTERN __attribute__((visibility("hidden")))
-#endif
-
-// Context data
-MATCEED_INTERN PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat);
-MATCEED_INTERN PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other);
-MATCEED_INTERN PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo);
-MATCEED_INTERN PetscErrorCode MatCeedSetContext(Mat mat, PetscErrorCode (*f)(void *), void *ctx);
-MATCEED_INTERN PetscErrorCode MatCeedGetContext(Mat mat, void *ctx);
-MATCEED_INTERN PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type);
-MATCEED_INTERN PetscErrorCode MatCeedGetInnerMatType(Mat mat, MatType *type);
-MATCEED_INTERN PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void));
-MATCEED_INTERN PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedGetLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedRestoreLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedGetCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedRestoreCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedSetLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose);
-MATCEED_INTERN PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose);
+// Core functionality
+PETSC_CEED_EXTERN PetscErrorCode MatCreateCeed(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetAssemblyDataUpdateNeeded(Mat mat_ceed, PetscBool update_needed);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedCreateMatCOO(Mat mat_ceed, Mat *mat_coo);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetPreallocationCOO(Mat mat_ceed, Mat mat_coo);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo);
+
+PETSC_CEED_INTERN PetscErrorCode MatCeedSetContextDouble(Mat mat, const char *name, double value);
+PETSC_CEED_INTERN PetscErrorCode MatCeedGetContextDouble(Mat mat, const char *name, double *value);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetContextReal(Mat mat, const char *name, PetscReal value);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetContextReal(Mat mat, const char *name, PetscReal *value);
+PETSC_CEED_INTERN PetscErrorCode MatCeedSetTime(Mat mat, PetscReal time);
+PETSC_CEED_INTERN PetscErrorCode MatCeedGetTime(Mat mat, PetscReal *time);
+PETSC_CEED_INTERN PetscErrorCode MatCeedSetDt(Mat mat, PetscReal dt);
+PETSC_CEED_INTERN PetscErrorCode MatCeedSetShifts(Mat mat, PetscReal shift_v, PetscReal shift_a);
+
+// Advanced functionality
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetContext(Mat mat, PetscCtxDestroyFn f, void *ctx);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetContext(Mat mat, void *ctx);
+
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void));
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetCOOMatType(Mat mat, MatType type);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetCOOMatType(Mat mat, MatType *type);
+
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedRestoreLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose);
+
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedRestoreCeedOperators(Mat mat, CeedOperator *op_mult, CeedOperator *op_mult_transpose);
+
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedSetCeedOperatorLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose);
+PETSC_CEED_EXTERN PetscErrorCode MatCeedGetCeedOperatorLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose);
diff --git a/examples/fluids/include/ceed-utils.h b/examples/fluids/include/petsc-ceed-utils.h
similarity index 84%
rename from examples/fluids/include/ceed-utils.h
rename to examples/fluids/include/petsc-ceed-utils.h
index b7962b0f5c..54b61610cb 100644
--- a/examples/fluids/include/ceed-utils.h
+++ b/examples/fluids/include/petsc-ceed-utils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -9,15 +9,43 @@
 #include <ceed.h>
 #include <petscdm.h>
 
-#define PetscCallCeed(ceed, ...)                                    \
-  do {                                                              \
-    int ierr = __VA_ARGS__;                                         \
-    if (ierr != CEED_ERROR_SUCCESS) {                               \
-      const char *error_message;                                    \
-      CeedGetErrorMessage(ceed, &error_message);                    \
-      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "%s", error_message); \
-    }                                                               \
-  } while (0)
+/**
+  @brief Copy the reference to a `Vec`.
+         Note: If `vec_copy` is non-null, it is assumed to be a valid pointer to a `Vec` and `VecDestroy()` will be called.
+
+  Collective across MPI processes.
+
+  @param[in]   vec       `Vec` to reference
+  @param[out]  vec_copy  Copy of reference
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+static inline PetscErrorCode VecReferenceCopy(Vec vec, Vec *vec_copy) {
+  PetscFunctionBeginUser;
+  PetscCall(PetscObjectReference((PetscObject)vec));
+  PetscCall(VecDestroy(vec_copy));
+  *vec_copy = vec;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Copy the reference to a `DM`.
+         Note: If `dm_copy` is non-null, it is assumed to be a valid pointer to a `DM` and `DMDestroy()` will be called.
+
+  Collective across MPI processes.
+
+  @param[in]   dm       `DM` to reference
+  @param[out]  dm_copy  Copy of reference
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+static inline PetscErrorCode DMReferenceCopy(DM dm, DM *dm_copy) {
+  PetscFunctionBeginUser;
+  PetscCall(PetscObjectReference((PetscObject)dm));
+  PetscCall(DMDestroy(dm_copy));
+  *dm_copy = dm;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
 
 /**
   @brief Translate PetscMemType to CeedMemType
@@ -192,6 +220,7 @@ static inline PetscErrorCode VecCopyPetscToCeed(Vec X_petsc, CeedVector x_ceed)
   PetscCall(VecGetArrayReadAndMemType(X_petsc, (const PetscScalar **)&x, &mem_type));
   PetscCallCeed(ceed, CeedVectorSetArray(x_ceed, MemTypePetscToCeed(mem_type), CEED_COPY_VALUES, x));
   PetscCall(VecRestoreArrayReadAndMemType(X_petsc, (const PetscScalar **)&x));
+  PetscCheck(CeedDestroy(&ceed) == CEED_ERROR_SUCCESS, PetscObjectComm((PetscObject)X_petsc), PETSC_ERR_LIB, "Destroying Ceed object failed");
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
diff --git a/examples/fluids/include/petsc-ceed.h b/examples/fluids/include/petsc-ceed.h
new file mode 100644
index 0000000000..3b3d648d15
--- /dev/null
+++ b/examples/fluids/include/petsc-ceed.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <petscsys.h>
+
+#if defined(__clang_analyzer__)
+#define PETSC_CEED_EXTERN extern
+#elif defined(__cplusplus)
+#define PETSC_CEED_EXTERN extern "C"
+#else
+#define PETSC_CEED_EXTERN extern
+#endif
+
+#if defined(__clang_analyzer__)
+#define PETSC_CEED_INTERN
+#else
+#define PETSC_CEED_INTERN PETSC_CEED_EXTERN __attribute__((visibility("hidden")))
+#endif
+
+/**
+  @brief Calls a libCEED function and then checks the resulting error code.
+  If the error code is non-zero, then a PETSc error is set with the libCEED error message.
+**/
+/// @ingroup RatelInternal
+#ifndef PetscCallCeed
+#define PetscCallCeed(ceed_, ...)                                   \
+  do {                                                              \
+    int ierr_q_;                                                    \
+    PetscStackUpdateLine;                                           \
+    ierr_q_ = __VA_ARGS__;                                          \
+    if (PetscUnlikely(ierr_q_ != CEED_ERROR_SUCCESS)) {             \
+      const char *error_message;                                    \
+      CeedGetErrorMessage(ceed_, &error_message);                   \
+      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "%s", error_message); \
+    }                                                               \
+  } while (0)
+#endif
diff --git a/examples/fluids/include/petsc_ops.h b/examples/fluids/include/petsc_ops.h
index 9913780172..d614df60ab 100644
--- a/examples/fluids/include/petsc_ops.h
+++ b/examples/fluids/include/petsc_ops.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/include/smartsim.h b/examples/fluids/include/smartsim.h
deleted file mode 100644
index f8ba943e5f..0000000000
--- a/examples/fluids/include/smartsim.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-#pragma once
-
-#include <c_client.h>
-#include <petscsys.h>
-#include <sr_enums.h>
-
-#if defined(__clang_analyzer__)
-void PetscCallSmartRedis(SRError);
-#else
-#define PetscCallSmartRedis(...)                                                                                                   \
-  do {                                                                                                                             \
-    SRError   ierr_smartredis_call_q_;                                                                                             \
-    PetscBool disable_calls = PETSC_FALSE;                                                                                         \
-    PetscStackUpdateLine;                                                                                                          \
-    PetscCall(PetscOptionsGetBool(NULL, NULL, "-smartsim_disable_calls", &disable_calls, NULL));                                   \
-    if (disable_calls == PETSC_TRUE) break;                                                                                        \
-    ierr_smartredis_call_q_ = __VA_ARGS__;                                                                                         \
-    if (PetscUnlikely(ierr_smartredis_call_q_ != SRNoError))                                                                       \
-      SETERRQ(PETSC_COMM_SELF, (PetscErrorCode)ierr_smartredis_call_q_, "SmartRedis Error (Code %d): %s", ierr_smartredis_call_q_, \
-              SRGetLastError());                                                                                                   \
-  } while (0)
-#endif
-
-PetscErrorCode SmartRedisVerifyPutTensor(void *c_client, const char *name, const size_t name_length);
diff --git a/examples/fluids/index.md b/examples/fluids/index.md
index 9c53bef0f3..27fe5b9a3c 100644
--- a/examples/fluids/index.md
+++ b/examples/fluids/index.md
@@ -9,7 +9,7 @@ Moreover, the Navier-Stokes example has been developed using PETSc, so that the
 ## Running the mini-app
 
 ```{include} README.md
-:start-after: inclusion-fluids-marker
+:start-after: <!-- fluids-inclusion -->
 ```
 ## The Navier-Stokes equations
 
@@ -317,65 +317,111 @@ Note that this wave speed is specific to ideal gases as $\gamma$ is an ideal gas
 Currently, this demo provides three types of problems/physical models that can be selected at run time via the option `-problem`.
 {ref}`problem-advection`, the problem of the transport of energy in a uniform vector velocity field, {ref}`problem-euler-vortex`, the exact solution to the Euler equations, and the so called {ref}`problem-density-current` problem.
 
-### Subgrid Stress Modeling
+### Statistics Collection
+For scale-resolving simulations (such as LES and DNS), statistics for a simulation are more often useful than time-instantaneous snapshots of the simulation itself.
+To make this process more computationally efficient, averaging in the spanwise direction, if physically correct, can help reduce the amount of simulation time needed to get converged statistics.
 
-When a fluid simulation is under-resolved (the smallest length scale resolved by the grid is much larger than the smallest physical scale, the [Kolmogorov length scale](https://en.wikipedia.org/wiki/Kolmogorov_microscales)), this is mathematically interpreted as filtering the Navier-Stokes equations.
-This is known as large-eddy simulation (LES), as only the "large" scales of turbulence are resolved.
-This filtering operation results in an extra stress-like term, $\bm{\tau}^r$, representing the effect of unresolved (or "subgrid" scale) structures in the flow.
-Denoting the filtering operation by $\overline \cdot$, the LES governing equations are:
+First, let's more precisely define what we mean by spanwise average.
+Denote $\langle \phi \rangle$ as the Reynolds average of $\phi$, which in this case would be a average over the spanwise direction and time:
 
 $$
-\frac{\partial \bm{\overline q}}{\partial t} + \nabla \cdot \bm{\overline F}(\bm{\overline q}) -S(\bm{\overline q}) = 0 \, ,
-$$ (eq-vector-les)
+\langle \phi \rangle(x,y) = \frac{1}{L_z + (T_f - T_0)}\int_0^{L_z} \int_{T_0}^{T_f} \phi(x, y, z, t) \mathrm{d}t \mathrm{d}z
+$$
 
-where
+where $z$ is the spanwise direction, the domain has size $[0, L_z]$ in the spanwise direction, and $[T_0, T_f]$ is the range of time being averaged over.
+Note that here and in the code, **we assume the spanwise direction to be in the $z$ direction**.
+
+To discuss the details of the implementation we'll first discuss the spanwise integral, then the temporal integral, and lastly the statistics themselves.
+
+#### Spanwise Integral
+The function $\langle \phi \rangle (x,y)$ is represented on a 2-D finite element grid, taken from the full domain mesh itself.
+If isoperiodicity is set, the periodic face is extracted as the spanwise statistics mesh.
+Otherwise the negative z face is used.
+We'll refer to this mesh as the *parent grid*, as for every "parent" point in the parent grid, there are many "child" points in the full domain.
+Define a function space on the parent grid as $\mathcal{V}_p^\mathrm{parent} = \{ \bm v(\bm x) \in H^{1}(\Omega_e^\mathrm{parent}) \,|\, \bm v(\bm x_e(\bm X)) \in P_p(\bm{I}), e=1,\ldots,N_e \}$.
+We enforce that the order of the parent FEM space is equal to the full domain's order.
+
+Many statistics are the product of 2 or more solution functions, which results in functions of degree higher than the parent FEM space, $\mathcal{V}_p^\mathrm{parent}$.
+To represent these higher-order functions on the parent FEM space, we perform an $L^2$ projection.
+Define the spanwise averaged function as:
 
 $$
-\bm{\overline F}(\bm{\overline q}) =
-\bm{F} (\bm{\overline q}) +
-\begin{pmatrix}
-    0\\
-     \bm{\tau}^r \\
-     \bm{u}  \cdot \bm{\tau}^r
-\end{pmatrix}
-$$ (eq-les-flux)
-
-More details on deriving the above expression, filtering, and large eddy simulation can be found in {cite}`popeTurbulentFlows2000`.
-To close the problem, the subgrid stress must be defined.
-For implicit LES, the subgrid stress is set to zero and the numerical properties of the discretized system are assumed to account for the effect of subgrid scale structures on the filtered solution field.
-For explicit LES, it is defined by a subgrid stress model.
-
-(sgs-dd-model)=
-#### Data-driven SGS Model
-
-The data-driven SGS model implemented here uses a small neural network to compute the SGS term.
-The SGS tensor is calculated at nodes using an $L^2$ projection of the velocity gradient and grid anisotropy tensor, and then interpolated onto quadrature points.
-More details regarding the theoretical background of the model can be found in {cite}`prakashDDSGS2022` and {cite}`prakashDDSGSAnisotropic2022`.
-
-The neural network itself consists of 1 hidden layer and 20 neurons, using Leaky ReLU as its activation function.
-The slope parameter for the Leaky ReLU function is set via `-sgs_model_dd_leakyrelu_alpha`.
-The outputs of the network are assumed to be normalized on a min-max scale, so they must be rescaled by the original min-max bounds.
-Parameters for the neural network are put into files in a directory found in `-sgs_model_dd_parameter_dir`.
-These files store the network weights (`w1.dat` and `w2.dat`), biases (`b1.dat` and `b2.dat`), and scaling parameters (`OutScaling.dat`).
-The first row of each files stores the number of columns and rows in each file.
-Note that the weight coefficients are assumed to be in column-major order.
-This is done to keep consistent with legacy file compatibility.
+\langle \phi \rangle_z(x,y,t) = \frac{1}{L_z} \int_0^{L_z} \phi(x, y, z, t) \mathrm{d}z
+$$
 
-:::{note}
-The current data-driven model parameters are not accurate and are for regression testing only.
-:::
+where the function $\phi$ may be the product of multiple solution functions and $\langle \phi \rangle_z$ denotes the spanwise average.
+The projection of a function $u$ onto the parent FEM space would look like:
+
+$$
+\bm M u_N = \int_0^{L_x} \int_0^{L_y} u \psi^\mathrm{parent}_N \mathrm{d}y \mathrm{d}x
+$$
+where $\bm M$ is the mass matrix for $\mathcal{V}_p^\mathrm{parent}$, $u_N$ the coefficients of the projected function, and $\psi^\mathrm{parent}_N$ the basis functions of the parent FEM space.
+Substituting the spanwise average of $\phi$ for $u$, we get:
+
+$$
+\bm M [\langle \phi \rangle_z]_N = \int_0^{L_x} \int_0^{L_y} \left [\frac{1}{L_z} \int_0^{L_z} \phi(x,y,z,t) \mathrm{d}z \right ] \psi^\mathrm{parent}_N(x,y) \mathrm{d}y \mathrm{d}x
+$$
 
-##### Data-driven Model Using External Libraries
+The triple integral in the right hand side is just an integral over the full domain
 
-There are two different modes for using the data-driven model: fused and sequential.
+$$
+\bm M [\langle \phi \rangle_z]_N = \frac{1}{L_z} \int_\Omega \phi(x,y,z,t) \psi^\mathrm{parent}_N(x,y) \mathrm{d}\Omega
+$$
+
+We need to evaluate $\psi^\mathrm{parent}_N$ at quadrature points in the full domain.
+To do this efficiently, **we assume and exploit the full domain grid to be a tensor product in the spanwise direction**.
+This assumption means quadrature points in the full domain have the same $(x,y)$ coordinate location as quadrature points in the parent domain.
+This also allows the use of the full domain quadrature weights for the triple integral.
 
-In fused mode, the input processing, model inference, and output handling were all done in a single CeedOperator.
-Conversely, sequential mode has separate function calls/CeedOperators for input creation, model inference, and output handling.
-By separating the three steps to the model evaluation, the sequential mode allows for functions calling external libraries to be used for the model inference step.
-This however is slower than the fused kernel, but this requires a native libCEED inference implementation.
+#### Temporal Integral/Averaging
+To calculate the temporal integral, we do a running average using left-rectangle rule.
+At the beginning of each simulation, the time integral of a statistic is set to 0, $\overline{\phi} = 0$.
+Periodically, the integral is updated using left-rectangle rule:
 
-To use the fused mode, set `-sgs_model_dd_use_fused true`.
-To use the sequential mode, set the same flag to `false`.
+$$\overline{\phi}_\mathrm{new} = \overline{\phi}_{\mathrm{old}} + \phi(t_\mathrm{new}) \Delta T$$
+where $\phi(t_\mathrm{new})$ is the statistic at the current time and $\Delta T$ is the time since the last update.
+When stats are written out to file, this running sum is then divided by $T_f - T_0$ to get the time average.
+
+With this method of calculating the running time average, we can plug this into the $L^2$ projection of the spanwise integral:
+
+$$
+\bm M [\langle \phi \rangle]_N = \frac{1}{L_z + (T_f - T_0)} \int_\Omega \int_{T_0}^{T_f} \phi(x,y,z,t) \psi^\mathrm{parent}_N \mathrm{d}t \mathrm{d}\Omega
+$$
+where the integral $\int_{T_0}^{T_f} \phi(x,y,z,t) \mathrm{d}t$ is calculated on a running basis.
+
+
+#### Running
+As the simulation runs, it takes a running time average of the statistics at the full domain quadrature points.
+This running average is only updated at the interval specified by `-ts_monitor_turbulence_spanstats_collect_interval` as number of timesteps.
+The $L^2$ projection problem is only solved when statistics are written to file, which is controlled by `-ts_monitor_turbulence_spanstats_viewer_interval`.
+Note that the averaging is not reset after each file write.
+The average is always over the bounds $[T_0, T_f]$, where $T_f$ in this case would be the time the file was written at and $T_0$ is the solution time at the beginning of the run.
+
+#### Turbulent Statistics
+
+The focus here are those statistics that are relevant to turbulent flow.
+The terms collected are listed below, with the mathematical definition on the left and the label (present in CGNS output files) is on the right.
+
+| Math                           | Label                           |
+| -----------------              | --------                        |
+| $\langle \rho \rangle$         | MeanDensity                     |
+| $\langle p \rangle$            | MeanPressure                    |
+| $\langle p^2 \rangle$          | MeanPressureSquared             |
+| $\langle p u_i \rangle$        | MeanPressureVelocity[$i$]       |
+| $\langle \rho T \rangle$       | MeanDensityTemperature          |
+| $\langle \rho T u_i \rangle$   | MeanDensityTemperatureFlux[$i$] |
+| $\langle \rho u_i \rangle$     | MeanMomentum[$i$]               |
+| $\langle \rho u_i u_j \rangle$ | MeanMomentumFlux[$ij$]          |
+| $\langle u_i \rangle$          | MeanVelocity[$i$]               |
+
+where [$i$] are suffixes to the labels. So $\langle \rho u_x u_y \rangle$ would correspond to MeanMomentumFluxXY.
+This naming convention attempts to mimic the CGNS standard.
+
+To get second-order statistics from these terms, simply use the identity:
+
+$$
+\langle \phi' \theta' \rangle = \langle \phi \theta \rangle - \langle \phi \rangle \langle \theta \rangle
+$$
 
 (differential-filtering)=
 ### Differential Filtering
@@ -470,42 +516,6 @@ To match the "size" of a normal kernel to our differential kernel, we attempt to
 To match the box and Gaussian filters "sizes", we use $\beta = 1/10$ and $\beta = 1/6$, respectively.
 $\beta$ can be set via `-diff_filter_kernel_scaling`.
 
-### *In Situ* Machine-Learning Model Training
-Training machine-learning models normally uses *a priori* (already gathered) data stored on disk.
-This is computationally inefficient, particularly as the scale of the problem grows and the data that is saved to disk reduces to a small percentage of the total data generated by a simulation.
-One way of working around this to to train a model on data coming from an ongoing simulation, known as *in situ* (in place) learning.
-
-This is implemented in the code using [SmartSim](https://www.craylabs.org/docs/overview.html).
-Briefly, the fluid simulation will periodically place data for training purposes into a database that a separate process uses to train a model.
-The database used by SmartSim is [Redis](https://redis.com/modules/redis-ai/) and the library to connect to the database is called [SmartRedis](https://www.craylabs.org/docs/smartredis.html).
-More information about how to utilize this code in a SmartSim configuration can be found on [SmartSim's website](https://www.craylabs.org/docs/overview.html).
-
-To use this code in a SmartSim *in situ* setup, first the code must be built with SmartRedis enabled.
-This is done by specifying the installation directory of SmartRedis using the `SMARTREDIS_DIR` environment variable when building:
-
-```
-make SMARTREDIS_DIR=~/software/smartredis/install
-```
-
-#### SGS Data-Driven Model *In Situ* Training
-Currently the code is only setup to do *in situ* training for the SGS data-driven model.
-Training data is split into the model inputs and outputs.
-The model inputs are calculated as the same model inputs in the SGS Data-Driven model described {ref}`earlier<sgs-dd-model>`.
-The model outputs (or targets in the case of training) are the subgrid stresses.
-Both the inputs and outputs are computed from a filtered velocity field, which is calculated via {ref}`differential-filtering`.
-The settings for the differential filtering used during training are described in {ref}`differential-filtering`.
-The training will create multiple sets of data per each filter width defined in `-sgs_train_filter_widths`.
-Those scalar filter widths correspond to the scaling correspond to $\bm{D} = c \bm{I}$, where $c$ is the scalar filter width.
-
-The SGS *in situ* training can be enabled using the `-sgs_train_enable` flag.
-Data can be processed and placed into the database periodically.
-The interval between is controlled by `-sgs_train_write_data_interval`.
-There's also the choice of whether to add new training data on each database write or to overwrite the old data with new data.
-This is controlled by `-sgs_train_overwrite_data`.
-
-The database may also be located on the same node as a MPI rank (collocated) or located on a separate node (distributed).
-It's necessary to know how many ranks are associated with each collocated database, which is set by `-smartsim_collocated_database_num_ranks`.
-
 (problem-advection)=
 ## Advection-Diffusion
 
@@ -855,20 +865,17 @@ numerous terms in the STG formulation.
 
 #### Internal Damping Layer (IDL)
 The STG inflow boundary condition creates large amplitude acoustic waves.
-We use an internal damping layer (IDL) to damp them out without disrupting the synthetic structures developing into natural turbulent structures. This implementation was inspired from
-{cite}`shurSTG2014`, but is implemented here as a ramped volumetric forcing
-term, similar to a sponge layer (see 8.4.2.4 in {cite}`colonius2023turbBC` for example). It takes the following form:
+We use an internal damping layer (IDL) to damp them out without disrupting the synthetic structures developing into natural turbulent structures.
+This implementation was inspired by {cite}`shurSTG2014`, but is implemented here as a ramped volumetric forcing term, similar to a sponge layer (see 8.4.2.4 in {cite}`colonius2023turbBC` for example).
+It takes the following form:
 
 $$
 S(\bm{q}) = -\sigma(\bm{x})\left.\frac{\partial \bm{q}}{\partial \bm{Y}}\right\rvert_{\bm{q}} \bm{Y}'
 $$
 
-where $\bm{Y}' = [P - P_\mathrm{ref}, \bm{0}, 0]^T$, and $\sigma(\bm{x})$ is a
-linear ramp starting at `-idl_start` with length `-idl_length` and an amplitude
-of inverse `-idl_decay_rate`. The damping is defined in terms of a pressure-primitive
-anomaly $\bm Y'$ converted to conservative source using $\partial
-\bm{q}/\partial \bm{Y}\rvert_{\bm{q}}$, which is linearized about the current
-flow state. $P_\mathrm{ref}$ is defined via the `-reference_pressure` flag.
+where $\bm{Y}' = [P - P_\mathrm{ref}, \bm{0}, 0]^T$, and $\sigma(\bm{x})$ is a linear ramp starting at `-idl_start` with length `-idl_length` and an amplitude of inverse `-idl_decay_rate`.
+The damping is defined in terms of a pressure-primitive anomaly $\bm Y'$ converted to conservative source using $\partial \bm{q}/\partial \bm{Y}\rvert_{\bm{q}}$, which is linearized about the current flow state.
+$P_\mathrm{ref}$ has a default value equal to `-reference_pressure` flag, with an optional flag `-idl_pressure` to set it to a different value.
 
 ### Meshing
 
diff --git a/examples/fluids/navierstokes.c b/examples/fluids/navierstokes.c
index 5741119dde..0b674bd660 100644
--- a/examples/fluids/navierstokes.c
+++ b/examples/fluids/navierstokes.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -18,18 +18,18 @@
 //     ./navierstokes -ceed /cpu/self -options_file gaussianwave.yml
 //     ./navierstokes -ceed /gpu/cuda -problem advection -degree 1
 //
-//TESTARGS(name="Gaussian Wave, explicit, supg") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -implicit false -ts_type rk -stab supg -state_var conservative -mass_ksp_type gmres -mass_pc_jacobi_type diagonal
+//TESTARGS(name="Newtonian and Riemann Solver Unit Tests",only="cpu") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e100 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin -dm_plex_box_faces 5,5,1 -ts_max_steps 0 -newtonian_unit_tests -riemann_solver_unit_tests
+//TESTARGS(name="Gaussian Wave, IDL and Entropy variables") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin -state_var entropy -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70
+//TESTARGS(name="Gaussian Wave, explicit, supg, IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -implicit false -ts_type rk -stab supg -state_var conservative -mass_ksp_type gmres -mass_pc_jacobi_type diagonal -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -idl_pressure 70
 //TESTARGS(name="Advection 2D, rotation, explicit, supg, consistent mass") -ceed {ceed_resource} -test_type solver -problem advection -degree 3 -dm_plex_box_faces 2,2 -dm_plex_box_lower 0,0 -dm_plex_box_upper 125,125 -bc_wall 1,2,3,4 -wall_comps 4 -units_kilogram 1e-9 -rc 100. -ts_dt 1e-3 -ts_max_steps 10 -stab supg -Ctaus 0.5 -mass_ksp_type gmres -mass_pc_type vpbjacobi -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv2d-rotation-explicit-stab-supg-consistent-mass.bin
-//TESTARGS(name="Advection, skew") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 5 -wind_type translation -wind_translation -0.5547002,0.83205029,0 -advection_ic_type skew  -dm_plex_box_faces 2,1,1 -degree 2 -stab supg -stab_tau advdiff_shakib -Ctau_a 4 -ksp_type gmres -diffusion_coeff 5e-4 -compare_final_state_atol 5e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-skew.bin
+//TESTARGS(name="Advection, skew") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 5 -wind_type translation -wind_translation -0.5547002,0.83205029,0 -advection_ic_type skew  -dm_plex_box_faces 2,1,1 -degree 2 -stab supg -stab_tau advdiff_shakib -Ctau_a 4 -ksp_type gmres -diffusion_coeff 5e-4 -compare_final_state_atol 7e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-skew.bin
 //TESTARGS(name="Blasius, bc_slip") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/blasius.yaml -ts_max_steps 5 -dm_plex_box_faces 3,20,1 -platemesh_nDelta 10 -platemesh_growth 1.2 -bc_outflow 5 -bc_slip 4 -compare_final_state_atol 2E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-bc_slip.bin
-//TESTARGS(name="Blasius, SGS DataDriven Sequential") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin -sgs_model_dd_use_fused false
 //TESTARGS(name="Advection, rotation, cosine") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/advection.yaml -ts_max_steps 0 -advection_ic_type cosine_hill -dm_plex_box_faces 2,1,1 -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-adv-rotation-cosine.bin
 //TESTARGS(name="Gaussian Wave, using MatShell") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 1e-8 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin -dm_plex_box_faces 2,2,1 -ts_max_steps 5 -degree 3 -amat_type shell -pc_type vpbjacobi -ts_alpha_radius 0.5
 //TESTARGS(name="Taylor-Green Vortex IC") -ceed {ceed_resource} -problem taylor_green -test_type solver -dm_plex_dim 3 -dm_plex_box_faces 6,6,6 -ts_max_steps 0 -compare_final_state_atol 1e-12 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-taylor-green-IC.bin
-//TESTARGS(name="Blasius, SGS DataDriven Fused") -ceed {ceed_resource} -options_file examples/fluids/tests-output/blasius_stgtest.yaml -sgs_model_type data_driven -sgs_model_dd_leakyrelu_alpha 0.3 -sgs_model_dd_parameter_dir examples/fluids/dd_sgs_data -ts_dt 2e-9 -state_var primitive -ksp_rtol 1e-12 -snes_rtol 1e-12 -stg_mean_only -stg_fluctuating_IC -test_type solver -compare_final_state_atol 1e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius-sgs-data-driven.bin
 //TESTARGS(name="Blasius, Anisotropic Differential Filter") -ceed {ceed_resource} -test_type diff_filter -options_file examples/fluids/tests-output/blasius_test.yaml -compare_final_state_atol 5e-10 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius_diff_filter_aniso_vandriest.bin -diff_filter_monitor -ts_max_steps 0 -state_var primitive -diff_filter_friction_length 1e-5 -diff_filter_wall_damping_function van_driest -diff_filter_ksp_rtol 1e-8 -diff_filter_grid_based_width -diff_filter_width_scaling 1,0.7,1
 //TESTARGS(name="Blasius, Isotropic Differential Filter") -ceed {ceed_resource} -test_type diff_filter -options_file examples/fluids/tests-output/blasius_test.yaml -compare_final_state_atol 2e-12 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius_diff_filter_iso.bin -diff_filter_monitor -ts_max_steps 0 -diff_filter_width_scaling 4.2e-5,4.2e-5,4.2e-5 -diff_filter_ksp_atol 1e-14 -diff_filter_ksp_rtol 1e-16
-//TESTARGS(name="Gaussian Wave, with IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -ts_alpha_radius 0.5
+//TESTARGS(name="Gaussian Wave, with IDL") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/gaussianwave.yaml -compare_final_state_atol 2e-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin -dm_plex_box_faces 5,5,1 -ts_max_steps 5 -idl_decay_time 2e-3 -idl_length 0.25 -idl_start 0 -ts_alpha_radius 0.5 -idl_pressure 70
 //TESTARGS(name="Spanwise Turbulence Statistics") -ceed {ceed_resource} -test_type turb_spanstats -options_file examples/fluids/tests-output/stats_test.yaml -compare_final_state_atol 1E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-turb-spanstats-stats.bin
 //TESTARGS(name="Blasius") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/tests-output/blasius_test.yaml -compare_final_state_atol 2E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius.bin
 //TESTARGS(name="Blasius, STG Inflow") -ceed {ceed_resource} -test_type solver -options_file examples/fluids/tests-output/blasius_stgtest.yaml -compare_final_state_atol 2E-11 -compare_final_state_filename examples/fluids/tests-output/fluids-navierstokes-blasius_STG.bin
@@ -89,10 +89,10 @@ int main(int argc, char **argv) {
   Units units;
   PetscCall(PetscCalloc1(1, &units));
 
-  user->app_ctx        = app_ctx;
-  user->units          = units;
-  user->phys           = phys_ctx;
-  problem->bc_from_ics = PETSC_TRUE;
+  user->app_ctx            = app_ctx;
+  user->units              = units;
+  user->phys               = phys_ctx;
+  problem->set_bc_from_ics = PETSC_TRUE;
 
   PetscCall(RegisterLogEvents());
 
@@ -106,6 +106,7 @@ int main(int argc, char **argv) {
   MPI_Comm comm = PETSC_COMM_WORLD;
   user->comm    = comm;
   PetscCall(ProcessCommandLineOptions(comm, app_ctx, bc));
+  PetscCall(BoundaryConditionSetUp(user, problem, app_ctx, bc));
 
   // ---------------------------------------------------------------------------
   // Initialize libCEED
@@ -208,7 +209,7 @@ int main(int argc, char **argv) {
   //    We use this for the main simulation DM because the reference DMPlexInsertBoundaryValues() is very slow on the GPU due to extra device-to-host
   //    communication. If we disable this, we should still get the same results due to the problem->bc function, but with potentially much slower
   //    execution.
-  if (problem->bc_from_ics) {
+  if (problem->set_bc_from_ics) {
     PetscCall(SetBCsFromICs(dm, Q, user->Q_loc));
   }
 
@@ -229,9 +230,6 @@ int main(int argc, char **argv) {
     PetscCall(SetupICsFromBinary(comm, app_ctx, Q));
   }
 
-  // Print problem summary
-  if (app_ctx->test_type == TESTTYPE_NONE) PetscCall(PrintRunInfo(user, phys_ctx, problem, comm));
-
   // -- Zero Q_loc
   PetscCall(VecZeroEntries(user->Q_loc));
 
@@ -240,7 +238,7 @@ int main(int argc, char **argv) {
   // ---------------------------------------------------------------------------
   TS          ts;
   PetscScalar final_time;
-  PetscCall(TSSolve_NS(dm, user, app_ctx, phys_ctx, &Q, &final_time, &ts));
+  PetscCall(TSSolve_NS(dm, user, app_ctx, phys_ctx, problem, &Q, &final_time, &ts));
 
   // ---------------------------------------------------------------------------
   // Post-processing
@@ -253,10 +251,7 @@ int main(int argc, char **argv) {
 
   PetscCall(TurbulenceStatisticsDestroy(user, ceed_data));
   PetscCall(NodalProjectionDataDestroy(user->grad_velo_proj));
-  PetscCall(SgsDDDataDestroy(user->sgs_dd_data));
   PetscCall(DifferentialFilterDataDestroy(user->diff_filter));
-  PetscCall(SGS_DD_TrainingDataDestroy(user->sgs_dd_train));
-  PetscCall(SmartSimDataDestroy(user->smartsim));
 
   // -- Vectors
   PetscCallCeed(ceed, CeedVectorDestroy(&ceed_data->x_coord));
@@ -268,9 +263,6 @@ int main(int argc, char **argv) {
   // -- Bases
   PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_q));
   PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_x));
-  PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_xc));
-  PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_q_sur));
-  PetscCallCeed(ceed, CeedBasisDestroy(&ceed_data->basis_x_sur));
 
   // -- Restrictions
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&ceed_data->elem_restr_q));
@@ -288,30 +280,14 @@ int main(int argc, char **argv) {
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_freestream_jacobian.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_slip.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_slip_jacobian.qfunction_context));
-    PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->setup_sur.qfunction_context));
-    PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->setup_vol.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->ics.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_vol_rhs.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_vol_ifunction.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->apply_vol_ijacobian.qfunction_context));
   }
 
-  // -- QFunctions
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_setup_vol));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_ics));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_rhs_vol));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_ifunction_vol));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_setup_sur));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_inflow));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_inflow_jacobian));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_freestream));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&ceed_data->qf_apply_freestream_jacobian));
-
   // -- Operators
-  PetscCallCeed(ceed, CeedOperatorDestroy(&ceed_data->op_setup_vol));
   PetscCall(OperatorApplyContextDestroy(ceed_data->op_ics_ctx));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&user->op_rhs_vol));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&user->op_ifunction_vol));
   PetscCall(OperatorApplyContextDestroy(user->op_rhs_ctx));
   PetscCall(OperatorApplyContextDestroy(user->op_strong_bc_ctx));
   PetscCallCeed(ceed, CeedOperatorDestroy(&user->op_ifunction));
@@ -353,8 +329,13 @@ int main(int argc, char **argv) {
   PetscCall(PetscFree(app_ctx->amat_type));
   PetscCall(PetscFree(app_ctx->wall_forces.walls));
   PetscCall(PetscViewerDestroy(&app_ctx->wall_forces.viewer));
+  PetscCall(PetscViewerDestroy(&app_ctx->turb_spanstats_viewer));
 
   // -- Structs
+  for (PetscInt i = 0; i < problem->num_bc_defs; i++) {
+    PetscCall(BCDefinitionDestroy(&problem->bc_defs[i]));
+  }
+  PetscCall(PetscFree(problem->bc_defs));
   PetscCall(PetscFree(units));
   PetscCall(PetscFree(user));
   PetscCall(PetscFree(problem));
diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h
index 49795d2b5f..26ba140814 100644
--- a/examples/fluids/navierstokes.h
+++ b/examples/fluids/navierstokes.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -6,17 +6,19 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-#include <ceed-utils.h>
 #include <ceed.h>
+#include <bc_definition.h>
+#include <log_events.h>
 #include <mat-ceed.h>
+#include <petsc-ceed-utils.h>
 #include <petscts.h>
 #include <stdbool.h>
 
 #include "./include/petsc_ops.h"
 #include "qfunctions/newtonian_types.h"
 
-#if PETSC_VERSION_LT(3, 21, 0)
-#error "PETSc v3.21 or later is required"
+#if PETSC_VERSION_LT(3, 23, 0)
+#error "PETSc v3.23 or later is required"
 #endif
 
 // -----------------------------------------------------------------------------
@@ -32,23 +34,22 @@ typedef enum {
   EULER_TEST_4                 = 4,
   EULER_TEST_5                 = 5,
 } EulerTestType;
-static const char *const EulerTestTypes[] = {"isentropic_vortex", "test_1",      "test_2", "test_3", "test_4", "test_5",
-                                             "EulerTestType",     "EULER_TEST_", NULL};
+static const char *const EulerTestTypes[] = {"ISENTROPIC_VORTEX", "1", "2", "3", "4", "5", "EulerTestType", "EULER_TEST_", NULL};
 
 // Advection - Wind types
-static const char *const WindTypes[] = {"rotation", "translation", "WindType", "WIND_", NULL};
+static const char *const WindTypes[] = {"ROTATION", "TRANSLATION", "WindType", "WIND_", NULL};
 
 // Advection - Initial Condition Types
-static const char *const AdvectionICTypes[] = {"sphere", "cylinder", "cosine_hill", "skew", "AdvectionICType", "ADVECTIONIC_", NULL};
+static const char *const AdvectionICTypes[] = {"SPHERE", "CYLINDER", "COSINE_HILL", "SKEW", "AdvectionICType", "ADVECTIONIC_", NULL};
 
 // Advection - Bubble Continuity Types
-static const char *const BubbleContinuityTypes[] = {"smooth", "back_sharp", "thick", "cosine", "BubbleContinuityType", "BUBBLE_CONTINUITY_", NULL};
+static const char *const BubbleContinuityTypes[] = {"SMOOTH", "BACK_SHARP", "THICK", "COSINE", "BubbleContinuityType", "BUBBLE_CONTINUITY_", NULL};
 
 // Stabilization methods
-static const char *const StabilizationTypes[] = {"none", "SU", "SUPG", "StabilizationType", "STAB_", NULL};
+static const char *const StabilizationTypes[] = {"NONE", "SU", "SUPG", "StabilizationType", "STAB_", NULL};
 
 // Stabilization tau constants
-static const char *const StabilizationTauTypes[] = {"Ctau", "AdvDiff_Shakib", "AdvDiff_Shakib_P", "StabilizationTauType", "STAB_TAU_", NULL};
+static const char *const StabilizationTauTypes[] = {"CTAU", "ADVDIFF_SHAKIB", "ADVDIFF_SHAKIB_P", "StabilizationTauType", "STAB_TAU_", NULL};
 
 // Test mode type
 typedef enum {
@@ -57,50 +58,29 @@ typedef enum {
   TESTTYPE_TURB_SPANSTATS = 2,
   TESTTYPE_DIFF_FILTER    = 3,
 } TestType;
-static const char *const TestTypes[] = {"none", "solver", "turb_spanstats", "diff_filter", "TestType", "TESTTYPE_", NULL};
-
-// Subgrid-Stress mode type
-typedef enum {
-  SGS_MODEL_NONE        = 0,
-  SGS_MODEL_DATA_DRIVEN = 1,
-} SGSModelType;
-static const char *const SGSModelTypes[] = {"none", "data_driven", "SGSModelType", "SGS_MODEL_", NULL};
+static const char *const TestTypes[] = {"NONE", "SOLVER", "TURB_SPANSTATS", "DIFF_FILTER", "TestType", "TESTTYPE_", NULL};
 
 // Mesh transformation type
 typedef enum {
   MESH_TRANSFORM_NONE      = 0,
   MESH_TRANSFORM_PLATEMESH = 1,
 } MeshTransformType;
-static const char *const MeshTransformTypes[] = {"none", "platemesh", "MeshTransformType", "MESH_TRANSFORM_", NULL};
+static const char *const MeshTransformTypes[] = {"NONE", "PLATEMESH", "MeshTransformType", "MESH_TRANSFORM_", NULL};
 
 static const char *const DifferentialFilterDampingFunctions[] = {
-    "none", "van_driest", "mms", "DifferentialFilterDampingFunction", "DIFF_FILTER_DAMP_", NULL};
-
-// -----------------------------------------------------------------------------
-// Log Events
-// -----------------------------------------------------------------------------
-extern PetscLogEvent FLUIDS_CeedOperatorApply;
-extern PetscLogEvent FLUIDS_CeedOperatorAssemble;
-extern PetscLogEvent FLUIDS_CeedOperatorAssembleDiagonal;
-extern PetscLogEvent FLUIDS_CeedOperatorAssemblePointBlockDiagonal;
-extern PetscLogEvent FLUIDS_SmartRedis_Init;
-extern PetscLogEvent FLUIDS_SmartRedis_Meta;
-extern PetscLogEvent FLUIDS_SmartRedis_Train;
-extern PetscLogEvent FLUIDS_TrainDataCompute;
-extern PetscLogEvent FLUIDS_DifferentialFilter;
-extern PetscLogEvent FLUIDS_VelocityGradientProjection;
-PetscErrorCode       RegisterLogEvents();
+    "NONE", "VAN_DRIEST", "MMS", "DifferentialFilterDampingFunction", "DIFF_FILTER_DAMP_", NULL};
 
 // -----------------------------------------------------------------------------
 // Structs
 // -----------------------------------------------------------------------------
 // Structs declarations
-typedef struct AppCtx_private   *AppCtx;
-typedef struct CeedData_private *CeedData;
-typedef struct User_private     *User;
-typedef struct Units_private    *Units;
-typedef struct SimpleBC_private *SimpleBC;
-typedef struct Physics_private  *Physics;
+typedef struct AppCtx_private      *AppCtx;
+typedef struct CeedData_private    *CeedData;
+typedef struct User_private        *User;
+typedef struct Units_private       *Units;
+typedef struct SimpleBC_private    *SimpleBC;
+typedef struct Physics_private     *Physics;
+typedef struct ProblemData_private *ProblemData;
 
 // Application context from user command line options
 struct AppCtx_private {
@@ -141,9 +121,6 @@ struct AppCtx_private {
     PetscViewerFormat viewer_format;
     PetscBool         header_written;
   } wall_forces;
-  // Subgrid Stress Model
-  SGSModelType sgs_model_type;
-  PetscBool    sgs_train_enable;
   // Differential Filtering
   PetscBool         diff_filter_monitor;
   MeshTransformType mesh_transform_type;
@@ -152,12 +129,9 @@ struct AppCtx_private {
 // libCEED data struct
 struct CeedData_private {
   CeedVector           x_coord, q_data;
-  CeedBasis            basis_x, basis_xc, basis_q, basis_x_sur, basis_q_sur;
+  CeedBasis            basis_x, basis_q;
   CeedElemRestriction  elem_restr_x, elem_restr_q, elem_restr_qd_i;
-  CeedOperator         op_setup_vol;
   OperatorApplyContext op_ics_ctx;
-  CeedQFunction        qf_setup_vol, qf_ics, qf_rhs_vol, qf_ifunction_vol, qf_setup_sur, qf_apply_inflow, qf_apply_inflow_jacobian, qf_apply_outflow,
-      qf_apply_outflow_jacobian, qf_apply_freestream, qf_apply_freestream_jacobian, qf_apply_slip, qf_apply_slip_jacobian;
 };
 
 typedef struct {
@@ -180,29 +154,6 @@ typedef struct {
   KSP                  ksp;
 } *NodalProjectionData;
 
-typedef PetscErrorCode (*SgsDDNodalStressEval)(User user, Vec Q_loc, Vec VelocityGradient, Vec SGSNodal_loc);
-typedef PetscErrorCode (*SgsDDNodalStressInference)(Vec DD_Inputs_loc, Vec DD_Outputs_loc, void *ctx);
-typedef struct {
-  DM                        dm_sgs, dm_dd_inputs, dm_dd_outputs;
-  PetscInt                  num_comp_sgs, num_comp_inputs, num_comp_outputs;
-  OperatorApplyContext      op_nodal_evaluation_ctx, op_nodal_dd_inputs_ctx, op_nodal_dd_outputs_ctx, op_sgs_apply_ctx;
-  CeedVector                sgs_nodal_ceed, grad_velo_ceed;
-  SgsDDNodalStressEval      sgs_nodal_eval;
-  SgsDDNodalStressInference sgs_nodal_inference;
-  void                     *sgs_nodal_inference_ctx;
-  PetscErrorCode (*sgs_nodal_inference_ctx_destroy)(void *ctx);
-} *SgsDDData;
-
-typedef struct {
-  DM                   dm_dd_training;
-  PetscInt             num_comp_dd_inputs, write_data_interval, num_filter_widths;
-  PetscScalar          filter_widths[16];
-  OperatorApplyContext op_training_data_calc_ctx;
-  NodalProjectionData  filtered_grad_velo_proj;
-  size_t               training_data_array_dims[2];
-  PetscBool            overwrite_training_data;
-} *SGS_DD_TrainingData;
-
 typedef struct {
   DM                    dm_filter;
   PetscInt              num_filtered_fields;
@@ -215,12 +166,6 @@ typedef struct {
   CeedContextFieldLabel filter_width_scaling_label;
 } *DiffFilterData;
 
-typedef struct {
-  void    *client;
-  char     rank_id_name[16];
-  PetscInt collocated_database_num_ranks;
-} *SmartSimData;
-
 // PETSc user data
 struct User_private {
   MPI_Comm             comm;
@@ -233,17 +178,14 @@ struct User_private {
   Physics              phys;
   AppCtx               app_ctx;
   CeedVector           q_ceed, q_dot_ceed, g_ceed, x_ceed;
-  CeedOperator         op_rhs_vol, op_ifunction_vol, op_ifunction;
+  CeedOperator         op_ifunction;
   Mat                  mat_ijacobian;
   KSP                  mass_ksp;
   OperatorApplyContext op_rhs_ctx, op_strong_bc_ctx;
   CeedScalar           time_bc_set;
   SpanStatsData        spanstats;
   NodalProjectionData  grad_velo_proj;
-  SgsDDData            sgs_dd_data;
   DiffFilterData       diff_filter;
-  SmartSimData         smartsim;
-  SGS_DD_TrainingData  sgs_dd_train;
 };
 
 // Units
@@ -263,12 +205,8 @@ struct Units_private {
 
 // Boundary conditions
 struct SimpleBC_private {
-  PetscInt num_wall,  // Number of faces with wall BCs
-      wall_comps[5],  // An array of constrained component numbers
-      num_comps,
-      num_symmetry[3],  // Number of faces with symmetry BCs
-      num_inflow, num_outflow, num_freestream, num_slip;
-  PetscInt walls[16], symmetries[3][16], inflows[16], outflows[16], freestreams[16], slips[16];
+  PetscInt num_inflow, num_outflow, num_freestream, num_slip;
+  PetscInt inflows[16], outflows[16], freestreams[16], slips[16];
 };
 
 // Struct that contains all enums and structs used for the physics of all problems
@@ -279,9 +217,10 @@ struct Physics_private {
   CeedContextFieldLabel stg_solution_time_label;
   CeedContextFieldLabel timestep_size_label;
   CeedContextFieldLabel ics_time_label;
-  CeedContextFieldLabel ijacobian_time_shift_label;
 };
 
+PetscErrorCode BoundaryConditionSetUp(User user, ProblemData problem, AppCtx app_ctx, SimpleBC bc);
+
 typedef struct {
   CeedQFunctionUser    qfunction;
   const char          *qfunction_loc;
@@ -289,14 +228,15 @@ typedef struct {
 } ProblemQFunctionSpec;
 
 // Problem specific data
-typedef struct ProblemData_private *ProblemData;
 struct ProblemData_private {
   CeedInt              dim, q_data_size_vol, q_data_size_sur, jac_data_size_sur;
   CeedScalar           dm_scale;
-  ProblemQFunctionSpec setup_vol, setup_sur, ics, apply_vol_rhs, apply_vol_ifunction, apply_vol_ijacobian, apply_inflow, apply_outflow,
-      apply_freestream, apply_slip, apply_inflow_jacobian, apply_outflow_jacobian, apply_freestream_jacobian, apply_slip_jacobian;
-  bool      non_zero_time;
-  PetscBool bc_from_ics, use_strong_bc_ceed, uses_newtonian;
+  ProblemQFunctionSpec ics, apply_vol_rhs, apply_vol_ifunction, apply_vol_ijacobian, apply_inflow, apply_outflow, apply_freestream, apply_slip,
+      apply_inflow_jacobian, apply_outflow_jacobian, apply_freestream_jacobian, apply_slip_jacobian;
+  bool          compute_exact_solution_error;
+  PetscBool     set_bc_from_ics, use_strong_bc_ceed, uses_newtonian;
+  PetscCount    num_bc_defs;
+  BCDefinition *bc_defs;
   PetscErrorCode (*print_info)(User, ProblemData, AppCtx);
   PetscErrorCode (*create_mass_operator)(User, CeedOperator *);
 };
@@ -329,7 +269,7 @@ extern PetscErrorCode PRINT_ADVECTION(User user, ProblemData problem, AppCtx app
 
 extern PetscErrorCode PRINT_ADVECTION2D(User user, ProblemData problem, AppCtx app_ctx);
 
-PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MPI_Comm comm);
+PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS ts);
 
 // -----------------------------------------------------------------------------
 // libCEED functions
@@ -349,13 +289,14 @@ PetscErrorCode DMPlexCeedElemRestrictionCollocatedCreate(Ceed ceed, DM dm, DMLab
 
 PetscErrorCode CreateBasisFromPlex(Ceed ceed, DM dm, DMLabel domain_label, CeedInt label_value, CeedInt height, CeedInt dm_field, CeedBasis *basis);
 
-// Utility function to create CEED Composite Operator for the entire domain
-PetscErrorCode CreateOperatorForDomain(Ceed ceed, DM dm, SimpleBC bc, CeedData ceed_data, Physics phys, CeedOperator op_apply_vol,
-                                       CeedOperator op_apply_ijacobian_vol, CeedInt height, CeedInt P_sur, CeedInt Q_sur, CeedInt q_data_size_sur,
-                                       CeedInt jac_data_size_sur, CeedOperator *op_apply, CeedOperator *op_apply_ijacobian);
-
 PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData problem, SimpleBC bc);
 
+PetscErrorCode QDataGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x,
+                        CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size);
+PetscErrorCode QDataGetNumComponents(DM dm, CeedInt *q_data_size);
+PetscErrorCode QDataBoundaryGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x,
+                                CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size);
+PetscErrorCode QDataBoundaryGetNumComponents(DM dm, CeedInt *q_data_size);
 // -----------------------------------------------------------------------------
 // Time-stepping functions
 // -----------------------------------------------------------------------------
@@ -369,7 +310,7 @@ PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *u
 PetscErrorCode TSMonitor_NS(TS ts, PetscInt step_no, PetscReal time, Vec Q, void *ctx);
 
 // TS: Create, setup, and solve
-PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Vec *Q, PetscScalar *f_time, TS *ts);
+PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, ProblemData problem, Vec *Q, PetscScalar *f_time, TS *ts);
 
 // Update Boundary Values when time has changed
 PetscErrorCode UpdateBoundaryValues(User user, Vec Q_loc, PetscReal t);
@@ -454,10 +395,6 @@ PetscErrorCode TurbulenceStatisticsDestroy(User user, CeedData ceed_data);
 // -----------------------------------------------------------------------------
 // Data-Driven Subgrid Stress (DD-SGS) Modeling Functions
 // -----------------------------------------------------------------------------
-
-PetscErrorCode SgsDDSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem);
-PetscErrorCode SgsDDDataDestroy(SgsDDData sgs_dd_data);
-PetscErrorCode SgsDDApplyIFunction(User user, const Vec Q_loc, Vec G_loc);
 PetscErrorCode VelocityGradientProjectionSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem, StateVariable state_var_input,
                                                CeedElemRestriction elem_restr_input, CeedBasis basis_input, NodalProjectionData *pgrad_velo_proj);
 PetscErrorCode VelocityGradientProjectionApply(NodalProjectionData grad_velo_proj, Vec Q_loc, Vec VelocityGradient);
@@ -486,13 +423,3 @@ PetscErrorCode DifferentialFilterDataDestroy(DiffFilterData diff_filter);
 PetscErrorCode TSMonitor_DifferentialFilter(TS ts, PetscInt steps, PetscReal solution_time, Vec Q, void *ctx);
 PetscErrorCode DifferentialFilterApply(User user, const PetscReal solution_time, const Vec Q, Vec Filtered_Solution);
 PetscErrorCode DifferentialFilterMmsICSetup(ProblemData problem);
-
-// -----------------------------------------------------------------------------
-// SGS Data-Driven Training via SmartSim
-// -----------------------------------------------------------------------------
-PetscErrorCode SmartSimSetup(User user);
-PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim);
-PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem);
-PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx);
-PetscErrorCode TSPostStep_SGS_DD_Training(TS ts);
-PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train);
diff --git a/examples/fluids/problems/advection.c b/examples/fluids/problems/advection.c
index 1d29b2cddb..79275a231c 100644
--- a/examples/fluids/problems/advection.c
+++ b/examples/fluids/problems/advection.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -14,8 +14,6 @@
 #include <petscdm.h>
 
 #include "../navierstokes.h"
-#include "../qfunctions/setupgeo.h"
-#include "../qfunctions/setupgeo2d.h"
 
 // @brief Create CeedOperator for stabilized mass KSP for explicit timestepping
 //
@@ -37,14 +35,12 @@ PetscErrorCode CreateKSPMassOperator_AdvectionStabilized(User user, CeedOperator
     CeedOperatorField field;
     PetscInt          sub_op_index = 0;  // will be 0 for the volume op
 
-    PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops));
+    PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(user->op_rhs_ctx->op, &sub_ops));
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q));
-    PetscCallCeed(ceed, CeedOperatorFieldGetBasis(field, &basis_q));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL));
 
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "qdata", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_qd_i));
-    PetscCallCeed(ceed, CeedOperatorFieldGetVector(field, &q_data));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_qd_i, NULL, &q_data));
 
     PetscCallCeed(ceed, CeedOperatorGetContext(sub_ops[sub_op_index], &qf_ctx));
   }
@@ -76,6 +72,11 @@ PetscErrorCode CreateKSPMassOperator_AdvectionStabilized(User user, CeedOperator
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "Grad_v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
 
+  PetscCallCeed(ceed, CeedVectorDestroy(&q_data));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i));
+  PetscCallCeed(ceed, CeedBasisDestroy(&basis_q));
+  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&qf_ctx));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -106,12 +107,6 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
   switch (dim) {
     case 2:
       problem->dim                               = 2;
-      problem->q_data_size_vol                   = 5;
-      problem->q_data_size_sur                   = 3;
-      problem->setup_vol.qfunction               = Setup2d;
-      problem->setup_vol.qfunction_loc           = Setup2d_loc;
-      problem->setup_sur.qfunction               = SetupBoundary2d;
-      problem->setup_sur.qfunction_loc           = SetupBoundary2d_loc;
       problem->ics.qfunction                     = ICsAdvection2d;
       problem->ics.qfunction_loc                 = ICsAdvection2d_loc;
       problem->apply_vol_rhs.qfunction           = RHS_Advection2d;
@@ -120,17 +115,11 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
       problem->apply_vol_ifunction.qfunction_loc = IFunction_Advection2d_loc;
       problem->apply_inflow.qfunction            = Advection2d_InOutFlow;
       problem->apply_inflow.qfunction_loc        = Advection2d_InOutFlow_loc;
-      problem->non_zero_time                     = PETSC_TRUE;
+      problem->compute_exact_solution_error      = PETSC_TRUE;
       problem->print_info                        = PRINT_ADVECTION;
       break;
     case 3:
       problem->dim                               = 3;
-      problem->q_data_size_vol                   = 10;
-      problem->q_data_size_sur                   = 10;
-      problem->setup_vol.qfunction               = Setup;
-      problem->setup_vol.qfunction_loc           = Setup_loc;
-      problem->setup_sur.qfunction               = SetupBoundary;
-      problem->setup_sur.qfunction_loc           = SetupBoundary_loc;
       problem->ics.qfunction                     = ICsAdvection;
       problem->ics.qfunction_loc                 = ICsAdvection_loc;
       problem->apply_vol_rhs.qfunction           = RHS_Advection;
@@ -139,7 +128,7 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
       problem->apply_vol_ifunction.qfunction_loc = IFunction_Advection_loc;
       problem->apply_inflow.qfunction            = Advection_InOutFlow;
       problem->apply_inflow.qfunction_loc        = Advection_InOutFlow_loc;
-      problem->non_zero_time                     = PETSC_FALSE;
+      problem->compute_exact_solution_error      = PETSC_FALSE;
       problem->print_info                        = PRINT_ADVECTION;
       break;
   }
@@ -210,8 +199,8 @@ PetscErrorCode NS_ADVECTION(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
   }
   if (wind_type == WIND_TRANSLATION && advectionic_type == ADVECTIONIC_BUBBLE_CYLINDER && wind[2] != 0.) {
     wind[2] = 0;
-    PetscCall(
-        PetscPrintf(comm, "Warning! Background wind in the z direction should be zero (-wind_translation x,x,0) with -advection_ic_type cylinder\n"));
+    PetscCall(PetscPrintf(comm,
+                          "Warning! Background wind in the z direction should be zero (-wind_translation x,x,0) with -advection_ic_type cylinder\n"));
   }
   if (stab == STAB_NONE && CtauS != 0) {
     PetscCall(PetscPrintf(comm, "Warning! Use -CtauS only with -stab su or -stab supg\n"));
@@ -309,8 +298,8 @@ PetscErrorCode PRINT_ADVECTION(User user, ProblemData problem, AppCtx app_ctx) {
         PetscCall(PetscPrintf(comm, "    Background Wind                    : %f,%f\n", setup_ctx->wind[0], setup_ctx->wind[1]));
         break;
       case 3:
-        PetscCall(
-            PetscPrintf(comm, "    Background Wind                    : %f,%f,%f\n", setup_ctx->wind[0], setup_ctx->wind[1], setup_ctx->wind[2]));
+        PetscCall(PetscPrintf(comm, "    Background Wind                    : %f,%f,%f\n", setup_ctx->wind[0], setup_ctx->wind[1],
+                              setup_ctx->wind[2]));
         break;
     }
   }
diff --git a/examples/fluids/problems/bc_freestream.c b/examples/fluids/problems/bc_freestream.c
index cff3c74e67..b2f23f786e 100644
--- a/examples/fluids/problems/bc_freestream.c
+++ b/examples/fluids/problems/bc_freestream.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,7 +16,9 @@
 #include "../navierstokes.h"
 #include "../qfunctions/newtonian_types.h"
 
-static const char *const RiemannSolverTypes[] = {"hll", "hllc", "RiemannSolverTypes", "RIEMANN_", NULL};
+static const char *const RiemannSolverTypes[] = {"HLL", "HLLC", "RiemannSolverTypes", "RIEMANN_", NULL};
+
+static PetscErrorCode RiemannSolverUnitTests(NewtonianIdealGasContext gas, CeedScalar rtol);
 
 PetscErrorCode FreestreamBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianIdealGasContext newtonian_ig_ctx, const StatePrimitive *reference) {
   User                 user = *(User *)ctx;
@@ -78,6 +80,22 @@ PetscErrorCode FreestreamBCSetup(ProblemData problem, DM dm, void *ctx, Newtonia
           break;
       }
       break;
+    case STATEVAR_ENTROPY:
+      switch (riemann) {
+        case RIEMANN_HLL:
+          problem->apply_freestream.qfunction              = Freestream_Entropy_HLL;
+          problem->apply_freestream.qfunction_loc          = Freestream_Entropy_HLL_loc;
+          problem->apply_freestream_jacobian.qfunction     = Freestream_Jacobian_Entropy_HLL;
+          problem->apply_freestream_jacobian.qfunction_loc = Freestream_Jacobian_Entropy_HLL_loc;
+          break;
+        case RIEMANN_HLLC:
+          problem->apply_freestream.qfunction              = Freestream_Entropy_HLLC;
+          problem->apply_freestream.qfunction_loc          = Freestream_Entropy_HLLC_loc;
+          problem->apply_freestream_jacobian.qfunction     = Freestream_Jacobian_Entropy_HLLC;
+          problem->apply_freestream_jacobian.qfunction_loc = Freestream_Jacobian_Entropy_HLLC_loc;
+          break;
+      }
+      break;
   }
 
   Y_inf.pressure *= Pascal;
@@ -96,6 +114,13 @@ PetscErrorCode FreestreamBCSetup(ProblemData problem, DM dm, void *ctx, Newtonia
   PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(freestream_context, CEED_MEM_HOST, FreeContextPetsc));
   problem->apply_freestream.qfunction_context = freestream_context;
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(freestream_context, &problem->apply_freestream_jacobian.qfunction_context));
+
+  {
+    PetscBool run_unit_tests = PETSC_FALSE;
+
+    PetscCall(PetscOptionsGetBool(NULL, NULL, "-riemann_solver_unit_tests", &run_unit_tests, NULL));
+    if (run_unit_tests) PetscCall(RiemannSolverUnitTests(newtonian_ig_ctx, 5e-7));
+  }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -119,13 +144,13 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId
   CeedScalar temperature = reference->temperature / Kelvin;
   CeedScalar recirc = 1, softplus_velocity = 1e-2;
   PetscOptionsBegin(user->comm, NULL, "Options for Outflow boundary condition", NULL);
-  PetscCall(
-      PetscOptionsEnum("-outflow_type", "Type of outflow condition", NULL, OutflowTypes, (PetscEnum)outflow_type, (PetscEnum *)&outflow_type, NULL));
+  PetscCall(PetscOptionsEnum("-outflow_type", "Type of outflow condition", NULL, OutflowTypes, (PetscEnum)outflow_type, (PetscEnum *)&outflow_type,
+                             NULL));
   PetscCall(PetscOptionsScalar("-outflow_pressure", "Pressure at outflow condition", NULL, pressure, &pressure, NULL));
   if (outflow_type == OUTFLOW_RIEMANN) {
     PetscCall(PetscOptionsScalar("-outflow_temperature", "Temperature at outflow condition", NULL, temperature, &temperature, NULL));
-    PetscCall(
-        PetscOptionsReal("-outflow_recirc", "Fraction of recirculation to allow in exterior velocity state [0,1]", NULL, recirc, &recirc, NULL));
+    PetscCall(PetscOptionsReal("-outflow_recirc", "Fraction of recirculation to allow in exterior velocity state [0,1]", NULL, recirc, &recirc,
+                               NULL));
     PetscCall(PetscOptionsReal("-outflow_softplus_velocity", "Characteristic velocity of softplus regularization", NULL, softplus_velocity,
                                &softplus_velocity, NULL));
   }
@@ -148,6 +173,12 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId
           problem->apply_outflow_jacobian.qfunction     = RiemannOutflow_Jacobian_Prim;
           problem->apply_outflow_jacobian.qfunction_loc = RiemannOutflow_Jacobian_Prim_loc;
           break;
+        case STATEVAR_ENTROPY:
+          problem->apply_outflow.qfunction              = RiemannOutflow_Entropy;
+          problem->apply_outflow.qfunction_loc          = RiemannOutflow_Entropy_loc;
+          problem->apply_outflow_jacobian.qfunction     = RiemannOutflow_Jacobian_Entropy;
+          problem->apply_outflow_jacobian.qfunction_loc = RiemannOutflow_Jacobian_Entropy_loc;
+          break;
       }
       break;
     case OUTFLOW_PRESSURE:
@@ -164,6 +195,12 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId
           problem->apply_outflow_jacobian.qfunction     = PressureOutflow_Jacobian_Prim;
           problem->apply_outflow_jacobian.qfunction_loc = PressureOutflow_Jacobian_Prim_loc;
           break;
+        case STATEVAR_ENTROPY:
+          problem->apply_outflow.qfunction              = PressureOutflow_Entropy;
+          problem->apply_outflow.qfunction_loc          = PressureOutflow_Entropy_loc;
+          problem->apply_outflow_jacobian.qfunction     = PressureOutflow_Jacobian_Entropy;
+          problem->apply_outflow_jacobian.qfunction_loc = PressureOutflow_Jacobian_Entropy_loc;
+          break;
       }
       break;
   }
@@ -181,3 +218,336 @@ PetscErrorCode OutflowBCSetup(ProblemData problem, DM dm, void *ctx, NewtonianId
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(outflow_context, &problem->apply_outflow_jacobian.qfunction_context));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
+
+// @brief Calculate relative error, (A - B) / S
+// If S < threshold, then set S=1
+static inline CeedScalar RelativeError(CeedScalar S, CeedScalar A, CeedScalar B, CeedScalar threshold) {
+  return (A - B) / (fabs(S) > threshold ? S : 1);
+}
+
+// @brief Check errors of a State vector and print if above tolerance
+static PetscErrorCode CheckQWithTolerance(const CeedScalar Q_s[5], const CeedScalar Q_a[5], const CeedScalar Q_b[5], const char *name,
+                                          PetscReal rtol_0, PetscReal rtol_u, PetscReal rtol_4) {
+  CeedScalar relative_error[5];  // relative error
+  CeedScalar divisor_threshold = 10 * CEED_EPSILON;
+
+  PetscFunctionBeginUser;
+  relative_error[0] = RelativeError(Q_s[0], Q_a[0], Q_b[0], divisor_threshold);
+  relative_error[4] = RelativeError(Q_s[4], Q_a[4], Q_b[4], divisor_threshold);
+
+  CeedScalar u_magnitude = sqrt(Square(Q_s[1]) + Square(Q_s[2]) + Square(Q_s[3]));
+  for (int i = 1; i < 4; i++) {
+    relative_error[i] = RelativeError(u_magnitude, Q_a[i], Q_b[i], divisor_threshold);
+  }
+
+  if (fabs(relative_error[0]) >= rtol_0) {
+    printf("%s[0] error %g (expected %.10e, got %.10e)\n", name, relative_error[0], Q_s[0], Q_a[0]);
+  }
+  for (int i = 1; i < 4; i++) {
+    if (fabs(relative_error[i]) >= rtol_u) {
+      printf("%s[%d] error %g (expected %.10e, got %.10e)\n", name, i, relative_error[i], Q_s[i], Q_a[i]);
+    }
+  }
+  if (fabs(relative_error[4]) >= rtol_4) {
+    printf("%s[4] error %g (expected %.10e, got %.10e)\n", name, relative_error[4], Q_s[4], Q_a[4]);
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify RiemannFlux_HLL_fwd function against finite-difference approximation
+static PetscErrorCode TestRiemannHLL_fwd(NewtonianIdealGasContext gas, CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) {
+  CeedScalar       eps = 4e-7;  // Finite difference step
+  char             buf[128];
+  const CeedScalar T           = 200;
+  const CeedScalar rho         = 1.2;
+  const CeedScalar p           = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T;
+  const CeedScalar u_base      = 40;
+  const CeedScalar u[3]        = {u_base, u_base * 1.1, u_base * 1.2};
+  const CeedScalar Y0_left[5]  = {p, u[0], u[1], u[2], T};
+  const CeedScalar Y0_right[5] = {1.2 * p, 1.2 * u[0], 1.2 * u[1], 1.2 * u[2], 1.2 * T};
+  CeedScalar       normal[3]   = {1, 2, 3};
+
+  PetscFunctionBeginUser;
+  State left0  = StateFromY(gas, Y0_left);
+  State right0 = StateFromY(gas, Y0_right);
+  ScaleN(normal, 1 / sqrt(Dot3(normal, normal)), 3);
+
+  for (int i = 0; i < 10; i++) {
+    CeedScalar dFlux[5] = {0.}, dFlux_fd[5] = {0.};
+    {  // Calculate dFlux using *_fwd function
+      CeedScalar dY_right[5] = {0};
+      CeedScalar dY_left[5]  = {0};
+
+      if (i < 5) {
+        dY_left[i] = Y0_left[i];
+      } else {
+        dY_right[i % 5] = Y0_right[i % 5];
+      }
+      State dleft0  = StateFromY_fwd(gas, left0, dY_left);
+      State dright0 = StateFromY_fwd(gas, right0, dY_right);
+
+      StateConservative dFlux_state = RiemannFlux_HLL_fwd(gas, left0, dleft0, right0, dright0, normal);
+      UnpackState_U(dFlux_state, dFlux);
+    }
+
+    {  // Calculate dFlux_fd via finite difference approximation
+      CeedScalar Y1_left[5]  = {Y0_left[0], Y0_left[1], Y0_left[2], Y0_left[3], Y0_left[4]};
+      CeedScalar Y1_right[5] = {Y0_right[0], Y0_right[1], Y0_right[2], Y0_right[3], Y0_right[4]};
+      CeedScalar Flux0[5], Flux1[5];
+
+      if (i < 5) {
+        Y1_left[i] *= 1 + eps;
+      } else {
+        Y1_right[i % 5] *= 1 + eps;
+      }
+      State left1  = StateFromY(gas, Y1_left);
+      State right1 = StateFromY(gas, Y1_right);
+
+      StateConservative Flux0_state = RiemannFlux_HLL(gas, left0, right0, normal);
+      StateConservative Flux1_state = RiemannFlux_HLL(gas, left1, right1, normal);
+      UnpackState_U(Flux0_state, Flux0);
+      UnpackState_U(Flux1_state, Flux1);
+      for (int j = 0; j < 5; j++) dFlux_fd[j] = (Flux1[j] - Flux0[j]) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "RiemannFlux_HLL i=%d: Flux", i);
+    PetscCall(CheckQWithTolerance(dFlux_fd, dFlux, dFlux_fd, buf, rtol_0, rtol_u, rtol_4));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify RiemannFlux_HLLC_fwd function against finite-difference approximation
+static PetscErrorCode TestRiemannHLLC_fwd(NewtonianIdealGasContext gas, CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) {
+  CeedScalar       eps = 4e-7;  // Finite difference step
+  char             buf[128];
+  const CeedScalar T           = 200;
+  const CeedScalar rho         = 1.2;
+  const CeedScalar p           = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T;
+  const CeedScalar u_base      = 40;
+  const CeedScalar u[3]        = {u_base, u_base * 1.1, u_base * 1.2};
+  const CeedScalar Y0_left[5]  = {p, u[0], u[1], u[2], T};
+  const CeedScalar Y0_right[5] = {1.2 * p, 1.2 * u[0], 1.2 * u[1], 1.2 * u[2], 1.2 * T};
+  CeedScalar       normal[3]   = {1, 2, 3};
+
+  PetscFunctionBeginUser;
+  State left0  = StateFromY(gas, Y0_left);
+  State right0 = StateFromY(gas, Y0_right);
+  ScaleN(normal, 1 / sqrt(Dot3(normal, normal)), 3);
+
+  for (int i = 0; i < 10; i++) {
+    CeedScalar dFlux[5] = {0.}, dFlux_fd[5] = {0.};
+    {  // Calculate dFlux using *_fwd function
+      CeedScalar dY_right[5] = {0};
+      CeedScalar dY_left[5]  = {0};
+
+      if (i < 5) {
+        dY_left[i] = Y0_left[i];
+      } else {
+        dY_right[i % 5] = Y0_right[i % 5];
+      }
+      State dleft0  = StateFromY_fwd(gas, left0, dY_left);
+      State dright0 = StateFromY_fwd(gas, right0, dY_right);
+
+      StateConservative dFlux_state = RiemannFlux_HLLC_fwd(gas, left0, dleft0, right0, dright0, normal);
+      UnpackState_U(dFlux_state, dFlux);
+    }
+
+    {  // Calculate dFlux_fd via finite difference approximation
+      CeedScalar Y1_left[5]  = {Y0_left[0], Y0_left[1], Y0_left[2], Y0_left[3], Y0_left[4]};
+      CeedScalar Y1_right[5] = {Y0_right[0], Y0_right[1], Y0_right[2], Y0_right[3], Y0_right[4]};
+      CeedScalar Flux0[5], Flux1[5];
+
+      if (i < 5) {
+        Y1_left[i] *= 1 + eps;
+      } else {
+        Y1_right[i % 5] *= 1 + eps;
+      }
+      State left1  = StateFromY(gas, Y1_left);
+      State right1 = StateFromY(gas, Y1_right);
+
+      StateConservative Flux0_state = RiemannFlux_HLLC(gas, left0, right0, normal);
+      StateConservative Flux1_state = RiemannFlux_HLLC(gas, left1, right1, normal);
+      UnpackState_U(Flux0_state, Flux0);
+      UnpackState_U(Flux1_state, Flux1);
+      for (int j = 0; j < 5; j++) dFlux_fd[j] = (Flux1[j] - Flux0[j]) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "RiemannFlux_HLLC i=%d: Flux", i);
+    PetscCall(CheckQWithTolerance(dFlux_fd, dFlux, dFlux_fd, buf, rtol_0, rtol_u, rtol_4));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify ComputeHLLSpeeds_Roe_fwd function against finite-difference approximation
+static PetscErrorCode TestComputeHLLSpeeds_Roe_fwd(NewtonianIdealGasContext gas, CeedScalar rtol) {
+  CeedScalar       eps = 4e-7;  // Finite difference step
+  char             buf[128];
+  const CeedScalar T           = 200;
+  const CeedScalar rho         = 1.2;
+  const CeedScalar p           = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T;
+  const CeedScalar u_base      = 40;
+  const CeedScalar u[3]        = {u_base, u_base * 1.1, u_base * 1.2};
+  const CeedScalar Y0_left[5]  = {p, u[0], u[1], u[2], T};
+  const CeedScalar Y0_right[5] = {1.2 * p, 1.2 * u[0], 1.2 * u[1], 1.2 * u[2], 1.2 * T};
+  CeedScalar       normal[3]   = {1, 2, 3};
+
+  PetscFunctionBeginUser;
+  State left0  = StateFromY(gas, Y0_left);
+  State right0 = StateFromY(gas, Y0_right);
+  ScaleN(normal, 1 / sqrt(Dot3(normal, normal)), 3);
+  CeedScalar u_left0  = Dot3(left0.Y.velocity, normal);
+  CeedScalar u_right0 = Dot3(right0.Y.velocity, normal);
+
+  for (int i = 0; i < 10; i++) {
+    CeedScalar ds_left, ds_right, ds_left_fd, ds_right_fd;
+    {  // Calculate ds_{left,right} using *_fwd function
+      CeedScalar dY_right[5] = {0};
+      CeedScalar dY_left[5]  = {0};
+
+      if (i < 5) {
+        dY_left[i] = Y0_left[i];
+      } else {
+        dY_right[i % 5] = Y0_right[i % 5];
+      }
+      State      dleft0   = StateFromY_fwd(gas, left0, dY_left);
+      State      dright0  = StateFromY_fwd(gas, right0, dY_right);
+      CeedScalar du_left  = Dot3(dleft0.Y.velocity, normal);
+      CeedScalar du_right = Dot3(dright0.Y.velocity, normal);
+
+      CeedScalar s_left, s_right;  // Throw away
+      ComputeHLLSpeeds_Roe_fwd(gas, left0, dleft0, u_left0, du_left, right0, dright0, u_right0, du_right, &s_left, &ds_left, &s_right, &ds_right);
+    }
+
+    {  // Calculate ds_{left,right}_fd via finite difference approximation
+      CeedScalar Y1_left[5]  = {Y0_left[0], Y0_left[1], Y0_left[2], Y0_left[3], Y0_left[4]};
+      CeedScalar Y1_right[5] = {Y0_right[0], Y0_right[1], Y0_right[2], Y0_right[3], Y0_right[4]};
+
+      if (i < 5) {
+        Y1_left[i] *= 1 + eps;
+      } else {
+        Y1_right[i % 5] *= 1 + eps;
+      }
+      State      left1    = StateFromY(gas, Y1_left);
+      State      right1   = StateFromY(gas, Y1_right);
+      CeedScalar u_left1  = Dot3(left1.Y.velocity, normal);
+      CeedScalar u_right1 = Dot3(right1.Y.velocity, normal);
+
+      CeedScalar s_left0, s_right0, s_left1, s_right1;
+      ComputeHLLSpeeds_Roe(gas, left0, u_left0, right0, u_right0, &s_left0, &s_right0);
+      ComputeHLLSpeeds_Roe(gas, left1, u_left1, right1, u_right1, &s_left1, &s_right1);
+      ds_left_fd  = (s_left1 - s_left0) / eps;
+      ds_right_fd = (s_right1 - s_right0) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "ComputeHLLSpeeds_Roe i=%d:", i);
+    {
+      CeedScalar divisor_threshold = 10 * CEED_EPSILON;
+      CeedScalar ds_left_err, ds_right_err;
+
+      ds_left_err  = RelativeError(ds_left_fd, ds_left, ds_left_fd, divisor_threshold);
+      ds_right_err = RelativeError(ds_right_fd, ds_right, ds_right_fd, divisor_threshold);
+      if (fabs(ds_left_err) >= rtol) printf("%s ds_left error %g (expected %.10e, got %.10e)\n", buf, ds_left_err, ds_left_fd, ds_left);
+      if (fabs(ds_right_err) >= rtol) printf("%s ds_right error %g (expected %.10e, got %.10e)\n", buf, ds_right_err, ds_right_fd, ds_right);
+    }
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify TotalSpecificEnthalpy_fwd function against finite-difference approximation
+static PetscErrorCode TestTotalSpecificEnthalpy_fwd(NewtonianIdealGasContext gas, CeedScalar rtol) {
+  CeedScalar       eps = 4e-7;  // Finite difference step
+  char             buf[128];
+  const CeedScalar T      = 200;
+  const CeedScalar rho    = 1.2;
+  const CeedScalar p      = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T;
+  const CeedScalar u_base = 40;
+  const CeedScalar u[3]   = {u_base, u_base * 1.1, u_base * 1.2};
+  const CeedScalar Y0[5]  = {p, u[0], u[1], u[2], T};
+
+  PetscFunctionBeginUser;
+  State state0 = StateFromY(gas, Y0);
+
+  for (int i = 0; i < 5; i++) {
+    CeedScalar dH, dH_fd;
+    {  // Calculate dH using *_fwd function
+      CeedScalar dY[5] = {0};
+
+      dY[i]         = Y0[i];
+      State dstate0 = StateFromY_fwd(gas, state0, dY);
+      dH            = TotalSpecificEnthalpy_fwd(gas, state0, dstate0);
+    }
+
+    {  // Calculate dH_fd via finite difference approximation
+      CeedScalar H0, H1;
+      CeedScalar Y1[5] = {Y0[0], Y0[1], Y0[2], Y0[3], Y0[4]};
+      Y1[i] *= 1 + eps;
+      State state1 = StateFromY(gas, Y1);
+
+      H0    = TotalSpecificEnthalpy(gas, state0);
+      H1    = TotalSpecificEnthalpy(gas, state1);
+      dH_fd = (H1 - H0) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "TotalSpecificEnthalpy i=%d:", i);
+    {
+      CeedScalar divisor_threshold = 10 * CEED_EPSILON;
+      CeedScalar dH_err;
+
+      dH_err = RelativeError(dH_fd, dH, dH_fd, divisor_threshold);
+      if (fabs(dH_err) >= rtol) printf("%s dH error %g (expected %.10e, got %.10e)\n", buf, dH_err, dH_fd, dH);
+    }
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify RoeSetup_fwd function against finite-difference approximation
+static PetscErrorCode TestRowSetup_fwd(NewtonianIdealGasContext gas, CeedScalar rtol) {
+  CeedScalar       eps = 4e-7;  // Finite difference step
+  char             buf[128];
+  const CeedScalar rho0[2] = {1.2, 1.4};
+
+  PetscFunctionBeginUser;
+  for (int i = 0; i < 2; i++) {
+    RoeWeights dR, dR_fd;
+    {  // Calculate using *_fwd function
+      CeedScalar drho[5] = {0};
+
+      drho[i] = rho0[i];
+      dR      = RoeSetup_fwd(rho0[0], rho0[1], drho[0], drho[1]);
+    }
+
+    {  // Calculate via finite difference approximation
+      RoeWeights R0, R1;
+      CeedScalar rho1[5] = {rho0[0], rho0[1]};
+      rho1[i] *= 1 + eps;
+
+      R0          = RoeSetup(rho0[0], rho0[1]);
+      R1          = RoeSetup(rho1[0], rho1[1]);
+      dR_fd.left  = (R1.left - R0.left) / eps;
+      dR_fd.right = (R1.right - R0.right) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "RoeSetup i=%d:", i);
+    {
+      CeedScalar divisor_threshold = 10 * CEED_EPSILON;
+      RoeWeights dR_err;
+
+      dR_err.left  = RelativeError(dR_fd.left, dR.left, dR_fd.left, divisor_threshold);
+      dR_err.right = RelativeError(dR_fd.right, dR.right, dR_fd.right, divisor_threshold);
+      if (fabs(dR_err.left) >= rtol) printf("%s dR.left error %g (expected %.10e, got %.10e)\n", buf, dR_err.left, dR_fd.left, dR.left);
+      if (fabs(dR_err.right) >= rtol) printf("%s dR.right error %g (expected %.10e, got %.10e)\n", buf, dR_err.right, dR_fd.right, dR.right);
+    }
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Test Riemann solver related `*_fwd` functions via finite-difference approximation
+static PetscErrorCode RiemannSolverUnitTests(NewtonianIdealGasContext gas, CeedScalar rtol) {
+  PetscFunctionBeginUser;
+  PetscCall(TestRiemannHLL_fwd(gas, rtol, rtol, rtol));
+  PetscCall(TestRiemannHLLC_fwd(gas, rtol, rtol, rtol));
+  PetscCall(TestComputeHLLSpeeds_Roe_fwd(gas, rtol));
+  PetscCall(TestTotalSpecificEnthalpy_fwd(gas, rtol));
+  PetscCall(TestRowSetup_fwd(gas, rtol));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
diff --git a/examples/fluids/problems/bc_slip.c b/examples/fluids/problems/bc_slip.c
index 4b6708436e..727188dfe0 100644
--- a/examples/fluids/problems/bc_slip.c
+++ b/examples/fluids/problems/bc_slip.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -34,6 +34,12 @@ PetscErrorCode SlipBCSetup(ProblemData problem, DM dm, void *ctx, CeedQFunctionC
       problem->apply_slip_jacobian.qfunction     = Slip_Jacobian_Prim;
       problem->apply_slip_jacobian.qfunction_loc = Slip_Jacobian_Prim_loc;
       break;
+    case STATEVAR_ENTROPY:
+      problem->apply_slip.qfunction              = Slip_Entropy;
+      problem->apply_slip.qfunction_loc          = Slip_Entropy_loc;
+      problem->apply_slip_jacobian.qfunction     = Slip_Jacobian_Entropy;
+      problem->apply_slip_jacobian.qfunction_loc = Slip_Jacobian_Entropy_loc;
+      break;
   }
 
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(newtonian_ig_qfctx, &problem->apply_slip.qfunction_context));
diff --git a/examples/fluids/problems/blasius.c b/examples/fluids/problems/blasius.c
index 4fbfc977a5..b9f3654046 100644
--- a/examples/fluids/problems/blasius.c
+++ b/examples/fluids/problems/blasius.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -21,10 +21,12 @@ PetscErrorCode CompressibleBlasiusResidual(SNES snes, Vec X, Vec R, void *ctx) {
   const BlasiusContext blasius = (BlasiusContext)ctx;
   const PetscScalar   *Tf, *Th;  // Chebyshev coefficients
   PetscScalar         *r, f[4], h[4];
-  PetscInt             N = blasius->n_cheb;
+  PetscInt             N       = blasius->n_cheb;
+  State                S_infty = blasius->S_infty;
+  CeedScalar           U_infty = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity));
 
   PetscFunctionBeginUser;
-  PetscScalar Ma = Mach(&blasius->newtonian_ctx, blasius->T_inf, blasius->U_inf), Pr = Prandtl(&blasius->newtonian_ctx),
+  PetscScalar Ma = Mach(&blasius->newtonian_ctx, S_infty.Y.temperature, U_infty), Pr = Prandtl(&blasius->newtonian_ctx),
               gamma = HeatCapacityRatio(&blasius->newtonian_ctx);
 
   PetscCall(VecGetArrayRead(X, &Tf));
@@ -59,7 +61,7 @@ PetscErrorCode CompressibleBlasiusResidual(SNES snes, Vec X, Vec R, void *ctx) {
 
   // h - left end boundary condition
   ChebyshevEval(N - 1, Th, -1., blasius->eta_max, h);
-  r[N] = h[0] - blasius->T_wall / blasius->T_inf;
+  r[N] = h[0] - blasius->T_wall / S_infty.Y.temperature;
 
   // h - right end boundary condition
   ChebyshevEval(N - 1, Th, 1., blasius->eta_max, h);
@@ -117,26 +119,33 @@ static PetscErrorCode GetYNodeLocs(const MPI_Comm comm, const char path[PETSC_MA
   FILE          *fp;
   const PetscInt char_array_len = 512;
   char           line[char_array_len];
-  char         **array;
   PetscReal     *node_locs;
 
   PetscFunctionBeginUser;
   PetscCall(PetscFOpen(comm, path, "r", &fp));
   PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line));
-  PetscCall(PetscStrToArray(line, ' ', &ndims, &array));
 
-  for (PetscInt i = 0; i < ndims; i++) dims[i] = atoi(array[i]);
+  {
+    char **array;
+
+    PetscCall(PetscStrToArray(line, ' ', &ndims, &array));
+    for (PetscInt i = 0; i < ndims; i++) dims[i] = atoi(array[i]);
+    PetscCall(PetscStrToArrayDestroy(ndims, array));
+  }
   if (ndims < 2) dims[1] = 1;  // Assume 1 column of data is not otherwise specified
   *nynodes = dims[0];
   PetscCall(PetscMalloc1(*nynodes, &node_locs));
 
   for (PetscInt i = 0; i < dims[0]; i++) {
+    char **array;
+
     PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line));
     PetscCall(PetscStrToArray(line, ' ', &ndims, &array));
     PetscCheck(ndims == dims[1], comm, PETSC_ERR_FILE_UNEXPECTED,
                "Line %" PetscInt_FMT " of %s does not contain correct number of columns (%d instead of %d)", i, path, ndims, dims[1]);
 
     node_locs[i] = (PetscReal)atof(array[0]);
+    PetscCall(PetscStrToArrayDestroy(ndims, array));
   }
   PetscCall(PetscFClose(comm, fp));
   *pynodes = node_locs;
@@ -252,7 +261,7 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   CeedScalar T_inf                                = 288.;         // K
   CeedScalar T_wall                               = 288.;         // K
   CeedScalar delta0                               = 4.2e-3;       // m
-  CeedScalar P0                                   = 1.01e5;       // Pa
+  CeedScalar P_inf                                = 1.01e5;       // Pa
   PetscInt   N                                    = 20;           // Number of Chebyshev terms
   PetscBool  weakT                                = PETSC_FALSE;  // weak density or temperature
   PetscReal  mesh_refine_height                   = 5.9e-4;       // m
@@ -260,14 +269,19 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   PetscInt   mesh_Ndelta                          = 45;           // [-]
   PetscReal  mesh_top_angle                       = 5;            // degrees
   char       mesh_ynodes_path[PETSC_MAX_PATH_LEN] = "";
+  PetscBool  P0_set;
 
   PetscOptionsBegin(comm, NULL, "Options for BLASIUS problem", NULL);
   PetscCall(PetscOptionsBool("-weakT", "Change from rho weak to T weak at inflow", NULL, weakT, &weakT, NULL));
   PetscCall(PetscOptionsScalar("-velocity_infinity", "Velocity at boundary layer edge", NULL, U_inf, &U_inf, NULL));
   PetscCall(PetscOptionsScalar("-temperature_infinity", "Temperature at boundary layer edge", NULL, T_inf, &T_inf, NULL));
+  PetscCall(PetscOptionsHasName(NULL, NULL, "-P0", &P0_set));  // For maintaining behavior of -P0 flag (which is deprecated)
+  PetscCall(
+      PetscOptionsDeprecated("-P0", "-pressure_infinity", "libCEED 0.12.0",
+                             "Use -pressure_infinity to set pressure at boundary layer edge and -idl_pressure to set the IDL reference pressure"));
+  PetscCall(PetscOptionsScalar("-pressure_infinity", "Pressure at boundary layer edge", NULL, P_inf, &P_inf, NULL));
   PetscCall(PetscOptionsScalar("-temperature_wall", "Temperature at wall", NULL, T_wall, &T_wall, NULL));
   PetscCall(PetscOptionsScalar("-delta0", "Boundary layer height at inflow", NULL, delta0, &delta0, NULL));
-  PetscCall(PetscOptionsScalar("-P0", "Pressure at outflow", NULL, P0, &P0, NULL));
   PetscCall(PetscOptionsInt("-n_chebyshev", "Number of Chebyshev terms", NULL, N, &N, NULL));
   PetscCheck(3 <= N && N <= BLASIUS_MAX_N_CHEBYSHEV, comm, PETSC_ERR_ARG_OUTOFRANGE, "-n_chebyshev %" PetscInt_FMT " must be in range [3, %d]", N,
              BLASIUS_MAX_N_CHEBYSHEV);
@@ -276,8 +290,8 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
     PetscCall(PetscOptionsScalar("-platemesh_refine_height", "Height of boundary layer mesh refinement", NULL, mesh_refine_height,
                                  &mesh_refine_height, NULL));
     PetscCall(PetscOptionsScalar("-platemesh_growth", "Geometric growth rate of boundary layer mesh", NULL, mesh_growth, &mesh_growth, NULL));
-    PetscCall(
-        PetscOptionsScalar("-platemesh_top_angle", "Geometric top_angle rate of boundary layer mesh", NULL, mesh_top_angle, &mesh_top_angle, NULL));
+    PetscCall(PetscOptionsScalar("-platemesh_top_angle", "Geometric top_angle rate of boundary layer mesh", NULL, mesh_top_angle, &mesh_top_angle,
+                                 NULL));
     PetscCall(PetscOptionsString("-platemesh_y_node_locs_path",
                                  "Path to file with y node locations. "
                                  "If empty, will use the algorithmic mesh warping.",
@@ -293,7 +307,7 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
 
   T_inf *= Kelvin;
   T_wall *= Kelvin;
-  P0 *= Pascal;
+  P_inf *= Pascal;
   U_inf *= meter / second;
   delta0 *= meter;
 
@@ -308,15 +322,19 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   // Some properties depend on parameters from NewtonianIdealGas
   PetscCallCeed(ceed, CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx));
 
-  blasius_ctx->weakT         = weakT;
-  blasius_ctx->U_inf         = U_inf;
-  blasius_ctx->T_inf         = T_inf;
-  blasius_ctx->T_wall        = T_wall;
-  blasius_ctx->delta0        = delta0;
-  blasius_ctx->P0            = P0;
-  blasius_ctx->n_cheb        = N;
-  newtonian_ig_ctx->P0       = P0;
-  blasius_ctx->implicit      = user->phys->implicit;
+  StatePrimitive Y_inf = {
+      .pressure = P_inf, .velocity = {U_inf, 0, 0},
+           .temperature = T_inf
+  };
+  State S_infty = StateFromPrimitive(newtonian_ig_ctx, Y_inf);
+
+  blasius_ctx->weakT    = weakT;
+  blasius_ctx->T_wall   = T_wall;
+  blasius_ctx->delta0   = delta0;
+  blasius_ctx->S_infty  = S_infty;
+  blasius_ctx->n_cheb   = N;
+  blasius_ctx->implicit = user->phys->implicit;
+  if (P0_set) newtonian_ig_ctx->idl_pressure = P_inf;  // For maintaining behavior of -P0 flag (which is deprecated)
   blasius_ctx->newtonian_ctx = *newtonian_ig_ctx;
 
   {
@@ -338,10 +356,12 @@ PetscErrorCode NS_BLASIUS(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   PetscCallCeed(ceed, CeedQFunctionContextDestroy(&problem->ics.qfunction_context));
   problem->ics.qfunction_context = blasius_context;
   if (use_stg) {
-    PetscCall(SetupStg(comm, dm, problem, user, weakT, T_inf, P0));
+    PetscCall(SetupStg(comm, dm, problem, user, weakT, S_infty.Y.temperature, S_infty.Y.pressure));
   } else if (diff_filter_mms) {
     PetscCall(DifferentialFilterMmsICSetup(problem));
   } else {
+    PetscCheck((user->phys->state_var == STATEVAR_CONSERVATIVE) || (user->app_ctx->test_type == TESTTYPE_DIFF_FILTER), user->comm,
+               PETSC_ERR_ARG_INCOMP, "Can only use conservative variables with Blasius and weak inflow");
     problem->apply_inflow.qfunction              = Blasius_Inflow;
     problem->apply_inflow.qfunction_loc          = Blasius_Inflow_loc;
     problem->apply_inflow_jacobian.qfunction     = Blasius_Inflow_Jacobian;
diff --git a/examples/fluids/problems/channel.c b/examples/fluids/problems/channel.c
index 8c0511114b..55734e042d 100644
--- a/examples/fluids/problems/channel.c
+++ b/examples/fluids/problems/channel.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/densitycurrent.c b/examples/fluids/problems/densitycurrent.c
index e49da42b5b..1dbbe36fb0 100644
--- a/examples/fluids/problems/densitycurrent.c
+++ b/examples/fluids/problems/densitycurrent.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/eulervortex.c b/examples/fluids/problems/eulervortex.c
index 0115ab5c83..34d74052ab 100644
--- a/examples/fluids/problems/eulervortex.c
+++ b/examples/fluids/problems/eulervortex.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -14,7 +14,6 @@
 #include <petscdm.h>
 
 #include "../navierstokes.h"
-#include "../qfunctions/setupgeo.h"
 
 PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   EulerTestType        euler_test;
@@ -33,12 +32,6 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC b
   //               SET UP DENSITY_CURRENT
   // ------------------------------------------------------
   problem->dim                               = 3;
-  problem->q_data_size_vol                   = 10;
-  problem->q_data_size_sur                   = 10;
-  problem->setup_vol.qfunction               = Setup;
-  problem->setup_vol.qfunction_loc           = Setup_loc;
-  problem->setup_sur.qfunction               = SetupBoundary;
-  problem->setup_sur.qfunction_loc           = SetupBoundary_loc;
   problem->ics.qfunction                     = ICsEuler;
   problem->ics.qfunction_loc                 = ICsEuler_loc;
   problem->apply_vol_rhs.qfunction           = Euler;
@@ -49,7 +42,7 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC b
   problem->apply_inflow.qfunction_loc        = TravelingVortex_Inflow_loc;
   problem->apply_outflow.qfunction           = Euler_Outflow;
   problem->apply_outflow.qfunction_loc       = Euler_Outflow_loc;
-  problem->non_zero_time                     = PETSC_TRUE;
+  problem->compute_exact_solution_error      = PETSC_TRUE;
   problem->print_info                        = PRINT_EULER_VORTEX;
 
   // ------------------------------------------------------
@@ -145,6 +138,7 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData problem, DM dm, void *ctx, SimpleBC b
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_vol_ifunction.qfunction_context));
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_inflow.qfunction_context));
   PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_outflow.qfunction_context));
+  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&euler_context));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
diff --git a/examples/fluids/problems/gaussianwave.c b/examples/fluids/problems/gaussianwave.c
index 9af7924b78..abadc453f7 100644
--- a/examples/fluids/problems/gaussianwave.c
+++ b/examples/fluids/problems/gaussianwave.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -37,6 +37,10 @@ PetscErrorCode NS_GAUSSIAN_WAVE(ProblemData problem, DM dm, void *ctx, SimpleBC
       problem->ics.qfunction     = IC_GaussianWave_Prim;
       problem->ics.qfunction_loc = IC_GaussianWave_Prim_loc;
       break;
+    case STATEVAR_ENTROPY:
+      problem->ics.qfunction     = IC_GaussianWave_Entropy;
+      problem->ics.qfunction_loc = IC_GaussianWave_Entropy_loc;
+      break;
   }
 
   // -- Option Defaults
diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c
index 61c7ec5a2d..1ab6e222ac 100644
--- a/examples/fluids/problems/newtonian.c
+++ b/examples/fluids/problems/newtonian.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -14,50 +14,140 @@
 #include <petscdm.h>
 
 #include "../navierstokes.h"
-#include "../qfunctions/setupgeo.h"
 
 // For use with PetscOptionsEnum
-static const char *const StateVariables[] = {"CONSERVATIVE", "PRIMITIVE", "StateVariable", "STATEVAR_", NULL};
+static const char *const StateVariables[] = {"CONSERVATIVE", "PRIMITIVE", "ENTROPY", "StateVariable", "STATEVAR_", NULL};
 
-// Compute relative error |a - b|/|s|
-static PetscErrorCode CheckPrimitiveWithTolerance(StatePrimitive sY, StatePrimitive aY, StatePrimitive bY, const char *name, PetscReal rtol_pressure,
-                                                  PetscReal rtol_velocity, PetscReal rtol_temperature) {
-  StatePrimitive eY;  // relative error
+static PetscErrorCode CheckQWithTolerance(const CeedScalar Q_s[5], const CeedScalar Q_a[5], const CeedScalar Q_b[5], const char *name,
+                                          PetscReal rtol_0, PetscReal rtol_u, PetscReal rtol_4) {
+  CeedScalar relative_error[5];  // relative error
+  CeedScalar divisor_threshold = 10 * CEED_EPSILON;
 
   PetscFunctionBeginUser;
-  eY.pressure   = (aY.pressure - bY.pressure) / sY.pressure;
-  PetscScalar u = sqrt(Square(sY.velocity[0]) + Square(sY.velocity[1]) + Square(sY.velocity[2]));
-  for (int j = 0; j < 3; j++) eY.velocity[j] = (aY.velocity[j] - bY.velocity[j]) / u;
-  eY.temperature = (aY.temperature - bY.temperature) / sY.temperature;
-  if (fabs(eY.pressure) > rtol_pressure) printf("%s: pressure error %g\n", name, eY.pressure);
-  for (int j = 0; j < 3; j++) {
-    if (fabs(eY.velocity[j]) > rtol_velocity) printf("%s: velocity[%d] error %g\n", name, j, eY.velocity[j]);
+  relative_error[0] = (Q_a[0] - Q_b[0]) / (fabs(Q_s[0]) > divisor_threshold ? Q_s[0] : 1);
+  relative_error[4] = (Q_a[4] - Q_b[4]) / (fabs(Q_s[4]) > divisor_threshold ? Q_s[4] : 1);
+
+  CeedScalar u_magnitude = sqrt(Square(Q_s[1]) + Square(Q_s[2]) + Square(Q_s[3]));
+  CeedScalar u_divisor   = u_magnitude > divisor_threshold ? u_magnitude : 1;
+  for (int i = 1; i < 4; i++) {
+    relative_error[i] = (Q_a[i] - Q_b[i]) / u_divisor;
+  }
+
+  if (fabs(relative_error[0]) >= rtol_0) {
+    printf("%s[0] error %g (expected %.10e, got %.10e)\n", name, relative_error[0], Q_s[0], Q_a[0]);
+  }
+  for (int i = 1; i < 4; i++) {
+    if (fabs(relative_error[i]) >= rtol_u) {
+      printf("%s[%d] error %g (expected %.10e, got %.10e)\n", name, i, relative_error[i], Q_s[i], Q_a[i]);
+    }
+  }
+  if (fabs(relative_error[4]) >= rtol_4) {
+    printf("%s[4] error %g (expected %.10e, got %.10e)\n", name, relative_error[4], Q_s[4], Q_a[4]);
   }
-  if (fabs(eY.temperature) > rtol_temperature) printf("%s: temperature error %g\n", name, eY.temperature);
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+// @brief Verify `StateFromQ` by converting A0 -> B0 -> A0_test, where A0 should equal A0_test
+static PetscErrorCode TestState(StateVariable state_var_A, StateVariable state_var_B, NewtonianIdealGasContext gas, const CeedScalar A0[5],
+                                CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) {
+  CeedScalar        B0[5], A0_test[5];
+  char              buf[128];
+  const char *const StateVariables_Initial[] = {"U", "Y", "V"};
+
+  PetscFunctionBeginUser;
+  const char *A_initial = StateVariables_Initial[state_var_A];
+  const char *B_initial = StateVariables_Initial[state_var_B];
+
+  State state_A0 = StateFromQ(gas, A0, state_var_A);
+  StateToQ(gas, state_A0, B0, state_var_B);
+  State state_B0 = StateFromQ(gas, B0, state_var_B);
+  StateToQ(gas, state_B0, A0_test, state_var_A);
+
+  snprintf(buf, sizeof buf, "%s->%s->%s: %s", A_initial, B_initial, A_initial, A_initial);
+  PetscCall(CheckQWithTolerance(A0, A0_test, A0, buf, rtol_0, rtol_u, rtol_4));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Verify `StateFromQ_fwd` via a finite difference approximation
+static PetscErrorCode TestState_fwd(StateVariable state_var_A, StateVariable state_var_B, NewtonianIdealGasContext gas, const CeedScalar A0[5],
+                                    CeedScalar rtol_0, CeedScalar rtol_u, CeedScalar rtol_4) {
+  CeedScalar        eps = 4e-7;  // Finite difference step
+  char              buf[128];
+  const char *const StateVariables_Initial[] = {"U", "Y", "V"};
+
+  PetscFunctionBeginUser;
+  const char *A_initial = StateVariables_Initial[state_var_A];
+  const char *B_initial = StateVariables_Initial[state_var_B];
+  State       state_0   = StateFromQ(gas, A0, state_var_A);
+
+  for (int i = 0; i < 5; i++) {
+    CeedScalar dB[5] = {0.}, dB_fd[5] = {0.};
+    {  // Calculate dB using State functions
+      CeedScalar dA[5] = {0};
+
+      dA[i]          = A0[i];
+      State dstate_0 = StateFromQ_fwd(gas, state_0, dA, state_var_A);
+      StateToQ_fwd(gas, state_0, dstate_0, dB, state_var_B);
+    }
+
+    {  // Calculate dB_fd via finite difference approximation
+      CeedScalar A1[5], B0[5], B1[5];
+
+      for (int j = 0; j < 5; j++) A1[j] = (1 + eps * (i == j)) * A0[j];
+      State state_1 = StateFromQ(gas, A1, state_var_A);
+      StateToQ(gas, state_0, B0, state_var_B);
+      StateToQ(gas, state_1, B1, state_var_B);
+      for (int j = 0; j < 5; j++) dB_fd[j] = (B1[j] - B0[j]) / eps;
+    }
+
+    snprintf(buf, sizeof buf, "d%s->d%s: StateFrom%s_fwd i=%d: d%s", A_initial, B_initial, A_initial, i, B_initial);
+    PetscCall(CheckQWithTolerance(dB_fd, dB, dB_fd, buf, rtol_0, rtol_u, rtol_4));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+// @brief Test the Newtonian State transformation functions, `StateFrom*`
 static PetscErrorCode UnitTests_Newtonian(User user, NewtonianIdealGasContext gas) {
   Units            units = user->units;
-  const CeedScalar eps   = 1e-6;
-  const CeedScalar kg = units->kilogram, m = units->meter, sec = units->second, Pascal = units->Pascal;
+  const CeedScalar kg = units->kilogram, m = units->meter, sec = units->second, K = units->Kelvin;
+
   PetscFunctionBeginUser;
-  const CeedScalar rho = 1.2 * kg / (m * m * m), u = 40 * m / sec;
-  CeedScalar       U[5] = {rho, rho * u, rho * u * 1.1, rho * u * 1.2, 250e3 * Pascal + .5 * rho * u * u};
-  State            s    = StateFromU(gas, U);
-  for (int i = 0; i < 8; i++) {
-    CeedScalar dU[5] = {0};
-    if (i < 5) dU[i] = U[i];
-    State ds = StateFromU_fwd(gas, s, dU);
-    for (int j = 0; j < 5; j++) dU[j] = (1 + eps * (i == j)) * U[j];
-    State          t = StateFromU(gas, dU);
-    StatePrimitive dY;
-    dY.pressure = (t.Y.pressure - s.Y.pressure) / eps;
-    for (int j = 0; j < 3; j++) dY.velocity[j] = (t.Y.velocity[j] - s.Y.velocity[j]) / eps;
-    dY.temperature = (t.Y.temperature - s.Y.temperature) / eps;
-    char buf[128];
-    snprintf(buf, sizeof buf, "StateFromU_fwd i=%d", i);
-    PetscCall(CheckPrimitiveWithTolerance(dY, ds.Y, dY, buf, 5e-6, 1e-6, 1e-6));
+  const CeedScalar T          = 200 * K;
+  const CeedScalar rho        = 1.2 * kg / Cube(m);
+  const CeedScalar P          = (HeatCapacityRatio(gas) - 1) * rho * gas->cv * T;
+  const CeedScalar u_base     = 40 * m / sec;
+  const CeedScalar u[3]       = {u_base, u_base * 1.1, u_base * 1.2};
+  const CeedScalar e_kinetic  = 0.5 * Dot3(u, u);
+  const CeedScalar e_internal = gas->cv * T;
+  const CeedScalar e_total    = e_kinetic + e_internal;
+  const CeedScalar gamma      = HeatCapacityRatio(gas);
+  const CeedScalar entropy    = log(P) - gamma * log(rho);
+  const CeedScalar rho_div_p  = rho / P;
+  const CeedScalar Y0[5]      = {P, u[0], u[1], u[2], T};
+  const CeedScalar U0[5]      = {rho, rho * u[0], rho * u[1], rho * u[2], rho * e_total};
+  const CeedScalar V0[5]      = {(gamma - entropy) / (gamma - 1) - rho_div_p * (e_kinetic), rho_div_p * u[0], rho_div_p * u[1], rho_div_p * u[2],
+                                 -rho_div_p};
+
+  {
+    CeedScalar rtol = 40 * CEED_EPSILON;
+
+    PetscCall(TestState(STATEVAR_PRIMITIVE, STATEVAR_CONSERVATIVE, gas, Y0, rtol, rtol, rtol));
+    PetscCall(TestState(STATEVAR_PRIMITIVE, STATEVAR_ENTROPY, gas, Y0, rtol, rtol, rtol));
+    PetscCall(TestState(STATEVAR_CONSERVATIVE, STATEVAR_PRIMITIVE, gas, U0, rtol, rtol, rtol));
+    PetscCall(TestState(STATEVAR_CONSERVATIVE, STATEVAR_ENTROPY, gas, U0, rtol, rtol, rtol));
+    PetscCall(TestState(STATEVAR_ENTROPY, STATEVAR_CONSERVATIVE, gas, V0, rtol, rtol, rtol));
+    PetscCall(TestState(STATEVAR_ENTROPY, STATEVAR_PRIMITIVE, gas, V0, rtol, rtol, rtol));
+  }
+
+  {
+    CeedScalar rtol = 5e-6;
+
+    PetscCall(TestState_fwd(STATEVAR_PRIMITIVE, STATEVAR_CONSERVATIVE, gas, Y0, rtol, rtol, rtol));
+    PetscCall(TestState_fwd(STATEVAR_PRIMITIVE, STATEVAR_ENTROPY, gas, Y0, rtol, rtol, rtol));
+    PetscCall(TestState_fwd(STATEVAR_CONSERVATIVE, STATEVAR_PRIMITIVE, gas, U0, rtol, rtol, rtol));
+    PetscCall(TestState_fwd(STATEVAR_CONSERVATIVE, STATEVAR_ENTROPY, gas, U0, 10 * rtol, rtol, rtol));
+    PetscCall(TestState_fwd(STATEVAR_ENTROPY, STATEVAR_CONSERVATIVE, gas, V0, 5 * rtol, rtol, rtol));
+    PetscCall(TestState_fwd(STATEVAR_ENTROPY, STATEVAR_PRIMITIVE, gas, V0, 5 * rtol, 5 * rtol, 5 * rtol));
   }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -81,14 +171,12 @@ PetscErrorCode CreateKSPMassOperator_NewtonianStabilized(User user, CeedOperator
     CeedOperatorField field;
     PetscInt          sub_op_index = 0;  // will be 0 for the volume op
 
-    PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops));
+    PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(user->op_rhs_ctx->op, &sub_ops));
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q));
-    PetscCallCeed(ceed, CeedOperatorFieldGetBasis(field, &basis_q));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL));
 
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "qdata", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_qd_i));
-    PetscCallCeed(ceed, CeedOperatorFieldGetVector(field, &q_data));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_qd_i, NULL, &q_data));
 
     PetscCallCeed(ceed, CeedOperatorGetContext(sub_ops[sub_op_index], &qf_ctx));
   }
@@ -113,9 +201,15 @@ PetscErrorCode CreateKSPMassOperator_NewtonianStabilized(User user, CeedOperator
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "Grad_v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
 
+  PetscCallCeed(ceed, CeedVectorDestroy(&q_data));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i));
+  PetscCallCeed(ceed, CeedBasisDestroy(&basis_q));
+  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&qf_ctx));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
+
 PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   SetupContext             setup_context;
   User                     user   = *(User *)ctx;
@@ -136,17 +230,11 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
   // ------------------------------------------------------
   //           Setup Generic Newtonian IG Problem
   // ------------------------------------------------------
-  problem->dim                     = 3;
-  problem->q_data_size_vol         = 10;
-  problem->q_data_size_sur         = 10;
-  problem->jac_data_size_sur       = 11;
-  problem->setup_vol.qfunction     = Setup;
-  problem->setup_vol.qfunction_loc = Setup_loc;
-  problem->setup_sur.qfunction     = SetupBoundary;
-  problem->setup_sur.qfunction_loc = SetupBoundary_loc;
-  problem->non_zero_time           = PETSC_FALSE;
-  problem->print_info              = PRINT_NEWTONIAN;
-  problem->uses_newtonian          = PETSC_TRUE;
+  problem->dim                          = 3;
+  problem->jac_data_size_sur            = 11;
+  problem->compute_exact_solution_error = PETSC_FALSE;
+  problem->print_info                   = PRINT_NEWTONIAN;
+  problem->uses_newtonian               = PETSC_TRUE;
 
   // ------------------------------------------------------
   //             Create the libCEED context
@@ -169,7 +257,7 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
   for (PetscInt i = 0; i < 3; i++) domain_size[i] = domain_max[i] - domain_min[i];
 
   StatePrimitive reference      = {.pressure = 1.01e5, .velocity = {0}, .temperature = 288.15};
-  CeedScalar     idl_decay_time = -1, idl_start = 0, idl_length = 0;
+  CeedScalar     idl_decay_time = -1, idl_start = 0, idl_length = 0, idl_pressure = reference.pressure;
   PetscBool      idl_enable = PETSC_FALSE;
 
   // ------------------------------------------------------
@@ -205,7 +293,6 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
       problem->apply_inflow_jacobian.qfunction     = BoundaryIntegral_Jacobian_Conserv;
       problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Conserv_loc;
       break;
-
     case STATEVAR_PRIMITIVE:
       problem->ics.qfunction                       = ICsNewtonianIG_Prim;
       problem->ics.qfunction_loc                   = ICsNewtonianIG_Prim_loc;
@@ -218,6 +305,18 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
       problem->apply_inflow_jacobian.qfunction     = BoundaryIntegral_Jacobian_Prim;
       problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Prim_loc;
       break;
+    case STATEVAR_ENTROPY:
+      problem->ics.qfunction                       = ICsNewtonianIG_Entropy;
+      problem->ics.qfunction_loc                   = ICsNewtonianIG_Entropy_loc;
+      problem->apply_vol_ifunction.qfunction       = IFunction_Newtonian_Entropy;
+      problem->apply_vol_ifunction.qfunction_loc   = IFunction_Newtonian_Entropy_loc;
+      problem->apply_vol_ijacobian.qfunction       = IJacobian_Newtonian_Entropy;
+      problem->apply_vol_ijacobian.qfunction_loc   = IJacobian_Newtonian_Entropy_loc;
+      problem->apply_inflow.qfunction              = BoundaryIntegral_Entropy;
+      problem->apply_inflow.qfunction_loc          = BoundaryIntegral_Entropy_loc;
+      problem->apply_inflow_jacobian.qfunction     = BoundaryIntegral_Jacobian_Entropy;
+      problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Entropy_loc;
+      break;
   }
 
   // -- Physics
@@ -229,8 +328,6 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
 
   PetscInt dim = problem->dim;
   PetscCall(PetscOptionsDeprecated("-g", "-gravity", "libCEED 0.11.1", NULL));
-  PetscCall(PetscOptionsRealArray("-g", "Gravitational acceleration vector", NULL, g, &dim, &given_option));
-  dim = problem->dim;
   PetscCall(PetscOptionsRealArray("-gravity", "Gravitational acceleration vector", NULL, g, &dim, &given_option));
   if (given_option) PetscCheck(dim == 3, comm, PETSC_ERR_ARG_SIZ, "Gravity vector must be size 3, %" PetscInt_FMT " values given", dim);
 
@@ -269,6 +366,9 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
   if (idl_decay_time < 0) idl_enable = PETSC_FALSE;
   PetscCall(PetscOptionsScalar("-idl_start", "Start of IDL in the x direction", NULL, idl_start, &idl_start, NULL));
   PetscCall(PetscOptionsScalar("-idl_length", "Length of IDL in the positive x direction", NULL, idl_length, &idl_length, NULL));
+  idl_pressure = reference.pressure;
+  PetscCall(PetscOptionsScalar("-idl_pressure", "Pressure IDL uses as reference (default is `-reference_pressure`)", NULL, idl_pressure,
+                               &idl_pressure, NULL));
   PetscOptionsEnd();
 
   if (stab == STAB_SUPG && !implicit) problem->create_mass_operator = CreateKSPMassOperator_NewtonianStabilized;
@@ -322,15 +422,14 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData problem, DM dm, void *ctx, SimpleBC b
   newtonian_ig_ctx->Ctau_C        = Ctau_C;
   newtonian_ig_ctx->Ctau_M        = Ctau_M;
   newtonian_ig_ctx->Ctau_E        = Ctau_E;
-  newtonian_ig_ctx->P0            = reference.pressure;
   newtonian_ig_ctx->stabilization = stab;
-  newtonian_ig_ctx->P0            = reference.pressure;
   newtonian_ig_ctx->is_implicit   = implicit;
   newtonian_ig_ctx->state_var     = state_var;
   newtonian_ig_ctx->idl_enable    = idl_enable;
   newtonian_ig_ctx->idl_amplitude = 1 / (idl_decay_time * second);
   newtonian_ig_ctx->idl_start     = idl_start * meter;
   newtonian_ig_ctx->idl_length    = idl_length * meter;
+  newtonian_ig_ctx->idl_pressure  = idl_pressure;
   PetscCall(PetscArraycpy(newtonian_ig_ctx->g, g, 3));
 
   // -- Setup Context
diff --git a/examples/fluids/problems/sgs_dd_model.c b/examples/fluids/problems/sgs_dd_model.c
deleted file mode 100644
index 3f5f3cddb6..0000000000
--- a/examples/fluids/problems/sgs_dd_model.c
+++ /dev/null
@@ -1,594 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/sgs_dd_model.h"
-
-#include <petscdmplex.h>
-
-#include "../navierstokes.h"
-
-typedef struct {
-  CeedElemRestriction  elem_restr_grid_aniso, elem_restr_sgs;
-  CeedVector           grid_aniso_ceed;
-  CeedQFunctionContext sgsdd_qfctx, ifunction_qfctx;
-} *SgsDDSetupData;
-
-PetscErrorCode SgsDDSetupDataDestroy(SgsDDSetupData sgs_dd_setup_data) {
-  Ceed ceed;
-
-  PetscFunctionBeginUser;
-  PetscCall(CeedElemRestrictionGetCeed(sgs_dd_setup_data->elem_restr_sgs, &ceed));
-
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&sgs_dd_setup_data->elem_restr_grid_aniso));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&sgs_dd_setup_data->elem_restr_sgs));
-  PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_setup_data->grid_aniso_ceed));
-  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&sgs_dd_setup_data->sgsdd_qfctx));
-  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&sgs_dd_setup_data->ifunction_qfctx));
-  PetscCall(PetscFree(sgs_dd_setup_data));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Create DM for storing subgrid stress at nodes
-static PetscErrorCode SgsDDCreateDM(DM dm_source, DM *dm_sgs, PetscInt degree, PetscInt q_extra, PetscInt *num_components) {
-  PetscSection section;
-
-  PetscFunctionBeginUser;
-  *num_components = 6;
-
-  PetscCall(DMClone(dm_source, dm_sgs));
-  PetscCall(PetscObjectSetName((PetscObject)*dm_sgs, "Subgrid Stress Projection"));
-
-  PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, num_components, *dm_sgs));
-
-  PetscCall(DMGetLocalSection(*dm_sgs, &section));
-  PetscCall(PetscSectionSetFieldName(section, 0, ""));
-  PetscCall(PetscSectionSetComponentName(section, 0, 0, "KMSubgridStressXX"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 1, "KMSubgridStressYY"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 2, "KMSubgridStressZZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 3, "KMSubgridStressYZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 4, "KMSubgridStressXZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 5, "KMSubgridStressXY"));
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-// @brief Evaluate data-driven SGS using fused method
-static PetscErrorCode SgsDDNodalStressEval_Fused(User user, Vec Q_loc, Vec VelocityGradient, Vec SGSNodal_loc) {
-  SgsDDData    sgs_dd_data = user->sgs_dd_data;
-  PetscMemType q_mem_type;
-
-  PetscFunctionBeginUser;
-  PetscCall(VecPetscToCeed(Q_loc, &q_mem_type, user->q_ceed));  // q_ceed is an implicit input
-
-  PetscCall(ApplyCeedOperatorGlobalToLocal(VelocityGradient, SGSNodal_loc, sgs_dd_data->op_nodal_evaluation_ctx));
-
-  PetscCall(VecCeedToPetsc(user->q_ceed, q_mem_type, Q_loc));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Create CeedOperator to calculate data-drive SGS at nodes using fused operator
-static PetscErrorCode SgsDDSetupNodalEvaluation_Fused(Ceed ceed, User user, CeedData ceed_data, SgsDDSetupData sgs_dd_setup_data) {
-  SgsDDData           sgs_dd_data = user->sgs_dd_data;
-  CeedQFunction       qf_sgs_dd_nodal;
-  CeedOperator        op_sgs_dd_nodal;
-  CeedInt             num_comp_q, num_comp_grad_velo, num_comp_x, num_comp_grid_aniso;
-  PetscInt            dim;
-  CeedVector          inv_multiplicity;
-  CeedElemRestriction elem_restr_inv_multiplicity, elem_restr_grad_velo, elem_restr_sgs;
-  DMLabel             domain_label = NULL;
-  PetscInt            label_value = 0, height = 0, dm_field = 0;
-
-  PetscFunctionBeginUser;
-  PetscCall(DMGetDimension(user->dm, &dim));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_x, &num_comp_x));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_q, &num_comp_q));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(sgs_dd_setup_data->elem_restr_grid_aniso, &num_comp_grid_aniso));
-
-  {  // Get velocity gradient information
-    CeedOperatorField op_field;
-    PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->grad_velo_proj->l2_rhs_ctx->op, "velocity gradient", &op_field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_grad_velo));
-    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_grad_velo, &num_comp_grad_velo));
-  }
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, &elem_restr_sgs));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_sgs, &sgs_dd_data->sgs_nodal_ceed, NULL));
-
-  PetscCall(GetInverseMultiplicity(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, PETSC_FALSE, &elem_restr_inv_multiplicity,
-                                   &inv_multiplicity));
-
-  // -- Create operator for SGS DD model nodal evaluation
-  switch (user->phys->state_var) {
-    case STATEVAR_PRIMITIVE:
-      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Prim, ComputeSgsDDNodal_Prim_loc, &qf_sgs_dd_nodal));
-      break;
-    case STATEVAR_CONSERVATIVE:
-      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Conserv, ComputeSgsDDNodal_Conserv_loc, &qf_sgs_dd_nodal));
-      break;
-    default:
-      SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "Data-driven SGS nodal evaluation not available for chosen state variable");
-  }
-
-  // Mesh/geometry order and solution basis order may differ, therefore must interpolate
-  CeedBasis basis_x_to_q;
-  PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x, ceed_data->basis_q, &basis_x_to_q));
-
-  PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_nodal, sgs_dd_setup_data->sgsdd_qfctx));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "q", num_comp_q, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "x", num_comp_x, CEED_EVAL_INTERP));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "gradient velocity", num_comp_grad_velo, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "anisotropy tensor", num_comp_grid_aniso, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_nodal, "inverse multiplicity", 1, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_nodal, "km_sgs", sgs_dd_data->num_comp_sgs, CEED_EVAL_NONE));
-
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_nodal, NULL, NULL, &op_sgs_dd_nodal));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "q", ceed_data->elem_restr_q, CEED_BASIS_NONE, user->q_ceed));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "x", ceed_data->elem_restr_x, basis_x_to_q, ceed_data->x_coord));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "gradient velocity", elem_restr_grad_velo, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "anisotropy tensor", sgs_dd_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE,
-                                           sgs_dd_setup_data->grid_aniso_ceed));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_nodal, "km_sgs", elem_restr_sgs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-  PetscCall(OperatorApplyContextCreate(user->grad_velo_proj->dm, sgs_dd_data->dm_sgs, ceed, op_sgs_dd_nodal, NULL, sgs_dd_data->sgs_nodal_ceed, NULL,
-                                       NULL, &sgs_dd_data->op_nodal_evaluation_ctx));
-
-  sgs_dd_setup_data->elem_restr_sgs = elem_restr_sgs;
-  sgs_dd_data->sgs_nodal_eval       = SgsDDNodalStressEval_Fused;
-
-  PetscCallCeed(ceed, CeedVectorDestroy(&inv_multiplicity));
-  PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_to_q));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_inv_multiplicity));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_nodal));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_nodal));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Setup data-driven model inference using internal (libCEED native) implementation
-static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential_Internal(Ceed ceed, SgsDDData sgs_dd_data, SgsDDSetupData sgs_dd_setup_data,
-                                                                    CeedElemRestriction elem_restr_dd_inputs,
-                                                                    CeedElemRestriction elem_restr_dd_outputs,
-                                                                    CeedElemRestriction elem_restr_inv_multiplicity, CeedVector inv_multiplicity,
-                                                                    void **ctx) {
-  CeedQFunction         qf_sgs_dd_inference;
-  CeedOperator          op_sgs_dd_inference;
-  OperatorApplyContext *op_context = (OperatorApplyContext *)ctx;
-
-  PetscFunctionBeginUser;
-  PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inference, ComputeSgsDDNodal_Sequential_Inference_loc,
-                                                  &qf_sgs_dd_inference));
-
-  PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_inference, sgs_dd_setup_data->sgsdd_qfctx));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inference, "model inputs", sgs_dd_data->num_comp_inputs, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inference, "inverse multiplicity", 1, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_inference, "model outputs", sgs_dd_data->num_comp_outputs, CEED_EVAL_NONE));
-
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_inference, NULL, NULL, &op_sgs_dd_inference));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inference, "model inputs", elem_restr_dd_inputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed,
-                CeedOperatorSetField(op_sgs_dd_inference, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inference, "model outputs", elem_restr_dd_outputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-  PetscCall(OperatorApplyContextCreate(sgs_dd_data->dm_dd_inputs, sgs_dd_data->dm_dd_outputs, ceed, op_sgs_dd_inference, NULL, NULL, NULL, NULL,
-                                       op_context));
-  sgs_dd_data->sgs_nodal_inference_ctx_destroy = (PetscErrorCode(*)(void *))OperatorApplyContextDestroy;
-
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_inference));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_inference));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Perform data-driven model inference using internal (libCEED native) implementation
-PetscErrorCode SgsDDNodalStressEval_Sequential_Internal(Vec DD_Inputs_loc, Vec DD_Outputs_loc, void *ctx) {
-  OperatorApplyContext op_context = *(OperatorApplyContext *)ctx;
-
-  PetscFunctionBeginUser;
-  PetscCall(ApplyCeedOperatorLocalToLocal(DD_Inputs_loc, DD_Outputs_loc, op_context));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Evaluate data-driven SGS using sequential method
-PetscErrorCode SgsDDNodalStressEval_Sequential(User user, Vec Q_loc, Vec VelocityGradient, Vec SGSNodal_loc) {
-  SgsDDData    sgs_dd_data = user->sgs_dd_data;
-  PetscMemType q_mem_type;
-  Vec          DD_Inputs_loc, DD_Outputs_loc;
-
-  PetscFunctionBeginUser;
-  PetscCall(DMGetLocalVector(sgs_dd_data->dm_dd_inputs, &DD_Inputs_loc));
-  PetscCall(DMGetLocalVector(sgs_dd_data->dm_dd_outputs, &DD_Outputs_loc));
-  PetscCall(VecPetscToCeed(Q_loc, &q_mem_type, user->q_ceed));  // q_ceed is an implicit input
-
-  PetscCall(ApplyCeedOperatorGlobalToLocal(VelocityGradient, DD_Inputs_loc, sgs_dd_data->op_nodal_dd_inputs_ctx));
-  PetscCall(sgs_dd_data->sgs_nodal_inference(DD_Inputs_loc, DD_Outputs_loc, &sgs_dd_data->sgs_nodal_inference_ctx));
-  PetscCall(ApplyCeedOperatorLocalToLocal(DD_Outputs_loc, SGSNodal_loc, sgs_dd_data->op_nodal_dd_outputs_ctx));
-
-  PetscCall(VecCeedToPetsc(user->q_ceed, q_mem_type, Q_loc));
-  PetscCall(DMRestoreLocalVector(sgs_dd_data->dm_dd_inputs, &DD_Inputs_loc));
-  PetscCall(DMRestoreLocalVector(sgs_dd_data->dm_dd_outputs, &DD_Outputs_loc));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Create CeedOperator to calculate data-drive SGS at nodes using sequentially-applied operators
-static PetscErrorCode SgsDDSetupNodalEvaluation_Sequential(Ceed ceed, User user, CeedData ceed_data, SgsDDSetupData sgs_dd_setup_data) {
-  SgsDDData           sgs_dd_data = user->sgs_dd_data;
-  CeedInt             num_comp_q, num_comp_grad_velo, num_comp_x, num_comp_grid_aniso, num_comp_eigvec = 9 + 1;
-  PetscInt            dim;
-  CeedVector          inv_multiplicity, eigvec;
-  CeedElemRestriction elem_restr_inv_multiplicity, elem_restr_grad_velo, elem_restr_sgs, elem_restr_eigvec, elem_restr_dd_inputs,
-      elem_restr_dd_outputs;
-  DMLabel  domain_label = NULL;
-  PetscInt label_value = 0, height = 0, dm_field = 0;
-
-  PetscFunctionBeginUser;
-  {  // Create DMs for data-driven input and output values
-    PetscSection section;
-    PetscInt     degree, q_extra;
-    {  // Get degree and number of quadrature points from dm_sgs
-      PetscFE         fe;
-      PetscSpace      basis;
-      PetscQuadrature quadrature;
-      PetscInt        num_qpnts;
-      PetscCall(DMGetField(sgs_dd_data->dm_sgs, 0, NULL, (PetscObject *)&fe));
-      PetscCall(PetscFEGetBasisSpace(fe, &basis));
-      PetscCall(PetscSpaceGetDegree(basis, &degree, NULL));
-      PetscCall(PetscFEGetQuadrature(fe, &quadrature));
-      PetscCall(PetscQuadratureGetOrder(quadrature, &num_qpnts));
-      q_extra = degree - num_qpnts;
-    }
-
-    PetscCall(DMClone(sgs_dd_data->dm_sgs, &sgs_dd_data->dm_dd_inputs));
-    PetscCall(PetscObjectSetName((PetscObject)sgs_dd_data->dm_dd_inputs, "Data-Driven Model Inputs"));
-    PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, &sgs_dd_data->num_comp_inputs, sgs_dd_data->dm_dd_inputs));
-    PetscCall(DMGetLocalSection(sgs_dd_data->dm_dd_inputs, &section));
-    PetscCall(PetscSectionSetFieldName(section, 0, ""));
-    for (CeedInt i = 0; i < sgs_dd_data->num_comp_inputs; i++) {
-      char component_name[PETSC_MAX_PATH_LEN];
-
-      PetscCall(PetscSNPrintf(component_name, sizeof component_name, "DataDrivenInput%" CeedInt_FMT, i + 1));
-      PetscCall(PetscSectionSetComponentName(section, 0, i, component_name));
-    }
-
-    PetscCall(DMClone(sgs_dd_data->dm_sgs, &sgs_dd_data->dm_dd_outputs));
-    PetscCall(PetscObjectSetName((PetscObject)sgs_dd_data->dm_dd_outputs, "Data-Driven Model Outputs"));
-    PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, &sgs_dd_data->num_comp_outputs, sgs_dd_data->dm_dd_outputs));
-    PetscCall(DMGetLocalSection(sgs_dd_data->dm_dd_outputs, &section));
-    PetscCall(PetscSectionSetFieldName(section, 0, ""));
-    for (CeedInt i = 0; i < sgs_dd_data->num_comp_outputs; i++) {
-      char component_name[PETSC_MAX_PATH_LEN];
-
-      PetscCall(PetscSNPrintf(component_name, sizeof component_name, "DataDrivenOutput%" CeedInt_FMT, i + 1));
-      PetscCall(PetscSectionSetComponentName(section, 0, i, component_name));
-    }
-  }
-
-  PetscCall(DMGetDimension(user->dm, &dim));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_x, &num_comp_x));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_q, &num_comp_q));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(sgs_dd_setup_data->elem_restr_grid_aniso, &num_comp_grid_aniso));
-
-  {  // Get velocity gradient information
-    CeedOperatorField op_field;
-    PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->grad_velo_proj->l2_rhs_ctx->op, "velocity gradient", &op_field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_grad_velo));
-    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_grad_velo, &num_comp_grad_velo));
-    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_grad_velo, &sgs_dd_data->grad_velo_ceed, NULL));
-  }
-
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, &elem_restr_sgs));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_sgs, &sgs_dd_data->sgs_nodal_ceed, NULL));
-  PetscCall(
-      DMPlexCeedElemRestrictionCollocatedCreate(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, num_comp_eigvec, &elem_restr_eigvec));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_eigvec, &eigvec, NULL));
-
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_dd_inputs, domain_label, label_value, height, dm_field, &elem_restr_dd_inputs));
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_data->dm_dd_outputs, domain_label, label_value, height, dm_field, &elem_restr_dd_outputs));
-
-  PetscCall(GetInverseMultiplicity(ceed, sgs_dd_data->dm_sgs, domain_label, label_value, height, dm_field, PETSC_FALSE, &elem_restr_inv_multiplicity,
-                                   &inv_multiplicity));
-
-  {  // Create operator for data-driven input evaluation
-    CeedQFunction qf_sgs_dd_inputs;
-    CeedOperator  op_sgs_dd_inputs;
-
-    switch (user->phys->state_var) {
-      case STATEVAR_PRIMITIVE:
-        PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inputs_Prim,
-                                                        ComputeSgsDDNodal_Sequential_Inputs_Prim_loc, &qf_sgs_dd_inputs));
-        break;
-      case STATEVAR_CONSERVATIVE:
-        PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Inputs_Conserv,
-                                                        ComputeSgsDDNodal_Sequential_Inputs_Conserv_loc, &qf_sgs_dd_inputs));
-        break;
-      default:
-        SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP,
-                "Data-driven SGS nodal input evaluation not available for chosen state variable");
-    }
-
-    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_inputs, sgs_dd_setup_data->sgsdd_qfctx));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "q", num_comp_q, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "gradient velocity", num_comp_grad_velo, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "anisotropy tensor", num_comp_grid_aniso, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_inputs, "inverse multiplicity", 1, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_inputs, "eigenvectors", num_comp_eigvec, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_inputs, "model inputs", sgs_dd_data->num_comp_inputs, CEED_EVAL_NONE));
-
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_inputs, NULL, NULL, &op_sgs_dd_inputs));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "q", ceed_data->elem_restr_q, CEED_BASIS_NONE, user->q_ceed));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "gradient velocity", elem_restr_grad_velo, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "anisotropy tensor", sgs_dd_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE,
-                                             sgs_dd_setup_data->grid_aniso_ceed));
-    PetscCallCeed(ceed,
-                  CeedOperatorSetField(op_sgs_dd_inputs, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "eigenvectors", elem_restr_eigvec, CEED_BASIS_NONE, eigvec));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_inputs, "model inputs", elem_restr_dd_inputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-    PetscCall(OperatorApplyContextCreate(user->grad_velo_proj->dm, sgs_dd_data->dm_dd_inputs, ceed, op_sgs_dd_inputs, NULL, NULL, NULL, NULL,
-                                         &sgs_dd_data->op_nodal_dd_inputs_ctx));
-    PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_inputs));
-    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_inputs));
-  }
-
-  {  // Create operator for data-driven output handling
-    CeedQFunction qf_sgs_dd_outputs;
-    CeedOperator  op_sgs_dd_outputs;
-
-    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSgsDDNodal_Sequential_Outputs, ComputeSgsDDNodal_Sequential_Outputs_loc,
-                                                    &qf_sgs_dd_outputs));
-    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_outputs, sgs_dd_setup_data->sgsdd_qfctx));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "model outputs", sgs_dd_data->num_comp_outputs, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "anisotropy tensor", num_comp_grid_aniso, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "inverse multiplicity", 1, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_outputs, "eigenvectors", num_comp_eigvec, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_outputs, "km_sgs", sgs_dd_data->num_comp_sgs, CEED_EVAL_NONE));
-
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_outputs, NULL, NULL, &op_sgs_dd_outputs));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "model outputs", elem_restr_dd_outputs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "anisotropy tensor", sgs_dd_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE,
-                                             sgs_dd_setup_data->grid_aniso_ceed));
-    PetscCallCeed(ceed,
-                  CeedOperatorSetField(op_sgs_dd_outputs, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "eigenvectors", elem_restr_eigvec, CEED_BASIS_NONE, eigvec));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_outputs, "km_sgs", elem_restr_sgs, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-    PetscCall(OperatorApplyContextCreate(sgs_dd_data->dm_dd_outputs, sgs_dd_data->dm_sgs, ceed, op_sgs_dd_outputs, NULL, sgs_dd_data->sgs_nodal_ceed,
-                                         NULL, NULL, &sgs_dd_data->op_nodal_dd_outputs_ctx));
-    PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_outputs));
-    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_outputs));
-  }
-
-  sgs_dd_data->sgs_nodal_inference = SgsDDNodalStressEval_Sequential_Internal;
-  sgs_dd_data->sgs_nodal_eval      = SgsDDNodalStressEval_Sequential;
-  PetscCall(SgsDDSetupNodalEvaluation_Sequential_Internal(ceed, sgs_dd_data, sgs_dd_setup_data, elem_restr_dd_inputs, elem_restr_dd_outputs,
-                                                          elem_restr_inv_multiplicity, inv_multiplicity, &sgs_dd_data->sgs_nodal_inference_ctx));
-
-  sgs_dd_setup_data->elem_restr_sgs = elem_restr_sgs;
-
-  PetscCallCeed(ceed, CeedVectorDestroy(&inv_multiplicity));
-  PetscCallCeed(ceed, CeedVectorDestroy(&eigvec));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_inv_multiplicity));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_eigvec));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_dd_inputs));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_dd_outputs));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Create CeedOperator to compute SGS contribution to the residual
-static PetscErrorCode SgsSetupNodalIFunction(Ceed ceed, User user, CeedData ceed_data, SgsDDSetupData sgs_dd_setup_data) {
-  SgsDDData     sgs_dd_data = user->sgs_dd_data;
-  CeedInt       num_comp_q, num_comp_qd, num_comp_x;
-  PetscInt      dim;
-  CeedQFunction qf_sgs_apply;
-  CeedOperator  op_sgs_apply;
-  CeedBasis     basis_sgs;
-
-  PetscFunctionBeginUser;
-  PetscCall(DMGetDimension(user->dm, &dim));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_q, &num_comp_q));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_qd_i, &num_comp_qd));
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(ceed_data->elem_restr_x, &num_comp_x));
-
-  PetscCall(CreateBasisFromPlex(ceed, sgs_dd_data->dm_sgs, 0, 0, 0, 0, &basis_sgs));
-
-  switch (user->phys->state_var) {
-    case STATEVAR_PRIMITIVE:
-      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, IFunction_NodalSgs_Prim, IFunction_NodalSgs_Prim_loc, &qf_sgs_apply));
-      break;
-    case STATEVAR_CONSERVATIVE:
-      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, IFunction_NodalSgs_Conserv, IFunction_NodalSgs_Conserv_loc, &qf_sgs_apply));
-      break;
-    default:
-      SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "Nodal SGS evaluation not available for chosen state variable");
-  }
-
-  PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_apply, sgs_dd_setup_data->ifunction_qfctx));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_apply, "q", num_comp_q, CEED_EVAL_INTERP));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_apply, "qdata", num_comp_qd, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_apply, "km_sgs", sgs_dd_data->num_comp_sgs, CEED_EVAL_INTERP));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_apply, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
-
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_apply, NULL, NULL, &op_sgs_apply));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "km_sgs", sgs_dd_setup_data->elem_restr_sgs, basis_sgs, sgs_dd_data->sgs_nodal_ceed));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_apply, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-
-  PetscCall(
-      OperatorApplyContextCreate(user->dm, user->dm, ceed, op_sgs_apply, user->q_ceed, user->g_ceed, NULL, NULL, &sgs_dd_data->op_sgs_apply_ctx));
-
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_apply));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_apply));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Calculate and add data-driven SGS residual to the global residual
-PetscErrorCode SgsDDApplyIFunction(User user, const Vec Q_loc, Vec G_loc) {
-  SgsDDData    sgs_dd_data = user->sgs_dd_data;
-  Vec          VelocityGradient, SGSNodal_loc;
-  PetscMemType sgs_nodal_mem_type;
-
-  PetscFunctionBeginUser;
-  PetscCall(DMGetGlobalVector(user->grad_velo_proj->dm, &VelocityGradient));
-  PetscCall(VelocityGradientProjectionApply(user->grad_velo_proj, Q_loc, VelocityGradient));
-
-  // -- Compute Nodal SGS tensor
-  PetscCall(DMGetLocalVector(sgs_dd_data->dm_sgs, &SGSNodal_loc));
-  PetscCall(sgs_dd_data->sgs_nodal_eval(user, Q_loc, VelocityGradient, SGSNodal_loc));
-
-  // -- Compute contribution of the SGS stress
-  PetscCall(VecPetscToCeed(SGSNodal_loc, &sgs_nodal_mem_type, sgs_dd_data->sgs_nodal_ceed));  // sgs_nodal_ceed is an implicit input
-  PetscCall(ApplyAddCeedOperatorLocalToLocal(Q_loc, G_loc, sgs_dd_data->op_sgs_apply_ctx));
-
-  // -- Return local SGS vector
-  PetscCall(VecCeedToPetsc(sgs_dd_data->sgs_nodal_ceed, sgs_nodal_mem_type, SGSNodal_loc));
-  PetscCall(DMRestoreLocalVector(sgs_dd_data->dm_sgs, &SGSNodal_loc));
-  PetscCall(DMRestoreGlobalVector(user->grad_velo_proj->dm, &VelocityGradient));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief B = A^T, A is NxM, B is MxN
-static PetscErrorCode TransposeMatrix(const PetscScalar *A, PetscScalar *B, const PetscInt N, const PetscInt M) {
-  PetscFunctionBeginUser;
-  for (PetscInt i = 0; i < N; i++) {
-    for (PetscInt j = 0; j < M; j++) {
-      B[j * N + i] = A[i * M + j];
-    }
-  }
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Read neural network coefficients from file and put into context struct
-static PetscErrorCode SgsDDContextFill(MPI_Comm comm, char data_dir[PETSC_MAX_PATH_LEN], SgsDDContext *psgsdd_ctx) {
-  SgsDDContext sgsdd_ctx;
-  PetscInt     num_inputs = (*psgsdd_ctx)->num_inputs, num_outputs = (*psgsdd_ctx)->num_outputs, num_neurons = (*psgsdd_ctx)->num_neurons;
-  char         file_path[PETSC_MAX_PATH_LEN];
-  PetscScalar *temp;
-
-  PetscFunctionBeginUser;
-  {
-    SgsDDContext sgsdd_temp;
-    PetscCall(PetscNew(&sgsdd_temp));
-    *sgsdd_temp                     = **psgsdd_ctx;
-    sgsdd_temp->offsets.bias1       = 0;
-    sgsdd_temp->offsets.bias2       = sgsdd_temp->offsets.bias1 + num_neurons;
-    sgsdd_temp->offsets.weight1     = sgsdd_temp->offsets.bias2 + num_neurons;
-    sgsdd_temp->offsets.weight2     = sgsdd_temp->offsets.weight1 + num_neurons * num_inputs;
-    sgsdd_temp->offsets.out_scaling = sgsdd_temp->offsets.weight2 + num_inputs * num_neurons;
-    PetscInt total_num_scalars      = sgsdd_temp->offsets.out_scaling + 2 * num_outputs;
-    sgsdd_temp->total_bytes         = sizeof(*sgsdd_ctx) + total_num_scalars * sizeof(sgsdd_ctx->data[0]);
-    PetscCall(PetscMalloc(sgsdd_temp->total_bytes, &sgsdd_ctx));
-    *sgsdd_ctx = *sgsdd_temp;
-    PetscCall(PetscFree(sgsdd_temp));
-  }
-
-  PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "b1.dat"));
-  PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, &sgsdd_ctx->data[sgsdd_ctx->offsets.bias1]));
-  PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "b2.dat"));
-  PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, &sgsdd_ctx->data[sgsdd_ctx->offsets.bias2]));
-  PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "OutScaling.dat"));
-  PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, &sgsdd_ctx->data[sgsdd_ctx->offsets.out_scaling]));
-
-  {
-    PetscCall(PetscMalloc1(num_inputs * num_neurons, &temp));
-    PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "w1.dat"));
-    PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, temp));
-    PetscCall(TransposeMatrix(temp, &sgsdd_ctx->data[sgsdd_ctx->offsets.weight1], num_inputs, num_neurons));
-    PetscCall(PetscFree(temp));
-  }
-  {
-    PetscCall(PetscMalloc1(num_outputs * num_neurons, &temp));
-    PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/%s", data_dir, "w2.dat"));
-    PetscCall(PhastaDatFileReadToArrayReal(comm, file_path, temp));
-    PetscCall(TransposeMatrix(temp, &sgsdd_ctx->data[sgsdd_ctx->offsets.weight2], num_neurons, num_outputs));
-    PetscCall(PetscFree(temp));
-  }
-
-  PetscCall(PetscFree(*psgsdd_ctx));
-  *psgsdd_ctx = sgsdd_ctx;
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SgsDDSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) {
-  PetscReal                alpha = 0;
-  SgsDDContext             sgsdd_ctx;
-  MPI_Comm                 comm                           = user->comm;
-  char                     sgs_dd_dir[PETSC_MAX_PATH_LEN] = "./dd_sgs_parameters";
-  SgsDDSetupData           sgs_dd_setup_data;
-  PetscBool                use_fused;
-  NewtonianIdealGasContext gas;
-
-  PetscFunctionBeginUser;
-  PetscCall(VelocityGradientProjectionSetup(ceed, user, ceed_data, problem, user->phys->state_var, ceed_data->elem_restr_q, ceed_data->basis_q,
-                                            &user->grad_velo_proj));
-
-  PetscCall(PetscNew(&user->sgs_dd_data));
-  user->sgs_dd_data->num_comp_inputs  = 6;
-  user->sgs_dd_data->num_comp_outputs = 6;
-
-  use_fused = PETSC_TRUE;
-  PetscOptionsBegin(comm, NULL, "SGS Data-Driven Model Options", NULL);
-  PetscCall(PetscOptionsReal("-sgs_model_dd_leakyrelu_alpha", "Slope parameter for Leaky ReLU activation function", NULL, alpha, &alpha, NULL));
-  PetscCall(PetscOptionsString("-sgs_model_dd_parameter_dir", "Path to directory with model parameters (weights, biases, etc.)", NULL, sgs_dd_dir,
-                               sgs_dd_dir, sizeof(sgs_dd_dir), NULL));
-  PetscCall(
-      PetscOptionsBool("-sgs_model_dd_use_fused", "Use the fused SGS DD model evaluation instead of sequential", NULL, use_fused, &use_fused, NULL));
-  PetscOptionsEnd();
-
-  PetscCall(PetscNew(&sgsdd_ctx));
-  sgsdd_ctx->num_layers  = 1;
-  sgsdd_ctx->num_inputs  = 6;
-  sgsdd_ctx->num_outputs = 6;
-  sgsdd_ctx->num_neurons = 20;
-  sgsdd_ctx->alpha       = alpha;
-
-  PetscCall(SgsDDContextFill(comm, sgs_dd_dir, &sgsdd_ctx));
-
-  // -- Create DM for storing SGS tensor at nodes
-  PetscCall(SgsDDCreateDM(user->dm, &user->sgs_dd_data->dm_sgs, user->app_ctx->degree, user->app_ctx->q_extra, &user->sgs_dd_data->num_comp_sgs));
-
-  PetscCall(PetscNew(&sgs_dd_setup_data));
-
-  PetscCallCeed(ceed, CeedQFunctionContextGetDataRead(problem->apply_vol_ifunction.qfunction_context, CEED_MEM_HOST, &gas));
-  sgsdd_ctx->gas = *gas;
-  PetscCallCeed(ceed, CeedQFunctionContextRestoreDataRead(problem->apply_vol_ifunction.qfunction_context, &gas));
-  PetscCallCeed(ceed, CeedQFunctionContextCreate(user->ceed, &sgs_dd_setup_data->sgsdd_qfctx));
-  PetscCallCeed(ceed,
-                CeedQFunctionContextSetData(sgs_dd_setup_data->sgsdd_qfctx, CEED_MEM_HOST, CEED_USE_POINTER, sgsdd_ctx->total_bytes, sgsdd_ctx));
-  PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(sgs_dd_setup_data->sgsdd_qfctx, CEED_MEM_HOST, FreeContextPetsc));
-
-  PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(problem->apply_vol_ifunction.qfunction_context, &sgs_dd_setup_data->ifunction_qfctx));
-
-  // -- Compute and store anisotropy tensor
-  PetscCall(GridAnisotropyTensorProjectionSetupApply(ceed, user, ceed_data, &sgs_dd_setup_data->elem_restr_grid_aniso,
-                                                     &sgs_dd_setup_data->grid_aniso_ceed));
-
-  // -- Create Nodal Evaluation Operator
-  if (use_fused) PetscCall(SgsDDSetupNodalEvaluation_Fused(ceed, user, ceed_data, sgs_dd_setup_data));
-  else PetscCall(SgsDDSetupNodalEvaluation_Sequential(ceed, user, ceed_data, sgs_dd_setup_data));
-
-  // -- Create Operator to evalutate residual of SGS stress
-  PetscCall(SgsSetupNodalIFunction(ceed, user, ceed_data, sgs_dd_setup_data));
-
-  PetscCall(SgsDDSetupDataDestroy(sgs_dd_setup_data));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SgsDDDataDestroy(SgsDDData sgs_dd_data) {
-  PetscFunctionBeginUser;
-  if (!sgs_dd_data) PetscFunctionReturn(PETSC_SUCCESS);
-  Ceed ceed = sgs_dd_data->op_sgs_apply_ctx->ceed;
-
-  PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_data->sgs_nodal_ceed));
-  PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_data->grad_velo_ceed));
-  PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_nodal_evaluation_ctx));
-  PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_sgs_apply_ctx));
-  PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_nodal_dd_inputs_ctx));
-  PetscCall(OperatorApplyContextDestroy(sgs_dd_data->op_nodal_dd_outputs_ctx));
-  PetscCall(DMDestroy(&sgs_dd_data->dm_sgs));
-  PetscCall(DMDestroy(&sgs_dd_data->dm_dd_inputs));
-  PetscCall(DMDestroy(&sgs_dd_data->dm_dd_outputs));
-  if (sgs_dd_data->sgs_nodal_inference_ctx) PetscCall(sgs_dd_data->sgs_nodal_inference_ctx_destroy(sgs_dd_data->sgs_nodal_inference_ctx));
-  PetscCall(PetscFree(sgs_dd_data));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
diff --git a/examples/fluids/problems/shocktube.c b/examples/fluids/problems/shocktube.c
index 36d2b991e9..462f9cebb8 100644
--- a/examples/fluids/problems/shocktube.c
+++ b/examples/fluids/problems/shocktube.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -14,7 +14,6 @@
 #include <petscdm.h>
 
 #include "../navierstokes.h"
-#include "../qfunctions/setupgeo.h"
 
 PetscErrorCode NS_SHOCKTUBE(ProblemData problem, DM dm, void *ctx, SimpleBC bc) {
   SetupContextShock    setup_context;
@@ -35,19 +34,13 @@ PetscErrorCode NS_SHOCKTUBE(ProblemData problem, DM dm, void *ctx, SimpleBC bc)
   //               SET UP SHOCKTUBE
   // ------------------------------------------------------
   problem->dim                               = 3;
-  problem->q_data_size_vol                   = 10;
-  problem->q_data_size_sur                   = 4;
-  problem->setup_vol.qfunction               = Setup;
-  problem->setup_vol.qfunction_loc           = Setup_loc;
-  problem->setup_sur.qfunction               = SetupBoundary;
-  problem->setup_sur.qfunction_loc           = SetupBoundary_loc;
   problem->ics.qfunction                     = ICsShockTube;
   problem->ics.qfunction_loc                 = ICsShockTube_loc;
   problem->apply_vol_rhs.qfunction           = EulerShockTube;
   problem->apply_vol_rhs.qfunction_loc       = EulerShockTube_loc;
   problem->apply_vol_ifunction.qfunction     = NULL;
   problem->apply_vol_ifunction.qfunction_loc = NULL;
-  problem->non_zero_time                     = PETSC_FALSE;
+  problem->compute_exact_solution_error      = PETSC_FALSE;
   problem->print_info                        = PRINT_SHOCKTUBE;
 
   // ------------------------------------------------------
diff --git a/examples/fluids/problems/stg_shur14.c b/examples/fluids/problems/stg_shur14.c
index 5d9d4f1dc4..ca8dd2d10f 100644
--- a/examples/fluids/problems/stg_shur14.c
+++ b/examples/fluids/problems/stg_shur14.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -99,6 +99,7 @@ static PetscErrorCode ReadStgInflow(const MPI_Comm comm, const char path[PETSC_M
     PetscCheck(wall_dist[i] >= 0, comm, PETSC_ERR_FILE_UNEXPECTED, "Distance to wall in %s cannot be negative", path);
     PetscCheck(lt[i] >= 0, comm, PETSC_ERR_FILE_UNEXPECTED, "Turbulent length scale in %s cannot be negative", path);
     PetscCheck(eps[i] >= 0, comm, PETSC_ERR_FILE_UNEXPECTED, "Turbulent dissipation in %s cannot be negative", path);
+    PetscCall(PetscStrToArrayDestroy(ndims, array));
   }
   CeedScalar(*cij)[stg_ctx->nprofs] = (CeedScalar(*)[stg_ctx->nprofs]) & stg_ctx->data[stg_ctx->offsets.cij];
   PetscCall(CalcCholeskyDecomp(comm, stg_ctx->nprofs, rij, cij));
@@ -144,6 +145,7 @@ static PetscErrorCode ReadStgRand(const MPI_Comm comm, const char path[PETSC_MAX
     sigma[0][i] = (CeedScalar)atof(array[4]);
     sigma[1][i] = (CeedScalar)atof(array[5]);
     sigma[2][i] = (CeedScalar)atof(array[6]);
+    PetscCall(PetscStrToArrayDestroy(ndims, array));
   }
   PetscCall(PetscFClose(comm, fp));
   PetscFunctionReturn(PETSC_SUCCESS);
@@ -221,7 +223,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
   char                     stg_inflow_path[PETSC_MAX_PATH_LEN] = "./STGInflow.dat";
   char                     stg_rand_path[PETSC_MAX_PATH_LEN]   = "./STGRand.dat";
   PetscBool                mean_only = PETSC_FALSE, use_stgstrong = PETSC_FALSE, use_fluctuating_IC = PETSC_FALSE, given_stg_dx = PETSC_FALSE;
-  CeedScalar               u0 = 0.0, alpha = 1.01, stg_dx = 1.0e-3;
+  CeedScalar               u0 = 0.0, alpha = 1.01, stg_dx = -1, stg_h_scale_factor = 1 / user->app_ctx->degree;
   CeedQFunctionContext     stg_context;
   NewtonianIdealGasContext newtonian_ig_ctx;
 
@@ -235,7 +237,11 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
   PetscCall(PetscOptionsBool("-stg_strong", "Enforce STG inflow strongly", NULL, use_stgstrong, &use_stgstrong, NULL));
   PetscCall(PetscOptionsBool("-stg_fluctuating_IC", "\"Extrude\" the fluctuations through the domain as an initial condition", NULL,
                              use_fluctuating_IC, &use_fluctuating_IC, NULL));
-  PetscCall(PetscOptionsReal("-stg_dx", "Element size in streamwise direction at inflow", NULL, stg_dx, &stg_dx, &given_stg_dx));
+  PetscCall(PetscOptionsReal("-stg_dx", "Element length in x direction at inflow", NULL, stg_dx, &stg_dx, &given_stg_dx));
+  PetscCall(PetscOptionsReal("-stg_h_scale_factor", "Scale element size for cutoff frequency calculation", NULL, stg_h_scale_factor,
+                             &stg_h_scale_factor, NULL));
+  PetscCall(PetscOptionsDeprecated("-stg_dyScale", NULL, "libCEED 0.12.0", "Use -stg_h_scale_factor to scale all the element dimensions"));
+  PetscCall(PetscOptionsDeprecated("-stg_dz", NULL, "libCEED 0.12.0", NULL));
   PetscOptionsEnd();
 
   PetscCall(PetscCalloc1(1, &global_stg_ctx));
@@ -247,6 +253,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
   global_stg_ctx->use_fluctuating_IC = use_fluctuating_IC;
   global_stg_ctx->theta0             = theta0;
   global_stg_ctx->P0                 = P0;
+  global_stg_ctx->h_scale_factor     = stg_h_scale_factor;
 
   {  // Calculate dx assuming constant spacing
     PetscReal domain_min[3], domain_max[3], domain_size[3];
@@ -256,6 +263,8 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
     PetscInt nmax = 3, faces[3];
     PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-dm_plex_box_faces", faces, &nmax, NULL));
     global_stg_ctx->dx = given_stg_dx ? stg_dx : domain_size[0] / faces[0];
+    PetscCheck((global_stg_ctx->dx > 0) && PetscIsNormalReal((PetscReal)global_stg_ctx->dx), comm, PETSC_ERR_LIB,
+               "STG dx must be positive normal number, got %g", global_stg_ctx->dx);
   }
 
   PetscCallCeed(ceed, CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx));
@@ -278,7 +287,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
   if (use_stgstrong) {
     // Use default boundary integral QF (BoundaryIntegral) in newtonian.h
     problem->use_strong_bc_ceed = PETSC_TRUE;
-    problem->bc_from_ics        = PETSC_FALSE;
+    problem->set_bc_from_ics    = PETSC_FALSE;
   } else {
     problem->apply_inflow.qfunction              = StgShur14Inflow;
     problem->apply_inflow.qfunction_loc          = StgShur14Inflow_loc;
@@ -286,7 +295,7 @@ PetscErrorCode SetupStg(const MPI_Comm comm, const DM dm, ProblemData problem, U
     problem->apply_inflow_jacobian.qfunction_loc = StgShur14Inflow_Jacobian_loc;
     PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(stg_context, &problem->apply_inflow.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionContextReferenceCopy(stg_context, &problem->apply_inflow_jacobian.qfunction_context));
-    problem->bc_from_ics = PETSC_TRUE;
+    problem->set_bc_from_ics = PETSC_TRUE;
   }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -307,6 +316,11 @@ PetscErrorCode SetupStrongStg(DM dm, SimpleBC bc, ProblemData problem, Physics p
       // {1,2,3,4} for u, v, w, T
       for (int i = 0; i < 4; i++) comps[i] = i + 1;
       break;
+
+    case STATEVAR_ENTROPY:
+      // {1,2,3,4}
+      for (int i = 0; i < 4; i++) comps[i] = i + 1;
+      break;
   }
 
   PetscCall(DMGetLabel(dm, "Face Sets", &label));
diff --git a/examples/fluids/problems/stg_shur14.h b/examples/fluids/problems/stg_shur14.h
index ea2087af28..49fd6f1f6b 100644
--- a/examples/fluids/problems/stg_shur14.h
+++ b/examples/fluids/problems/stg_shur14.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/problems/taylorgreen.c b/examples/fluids/problems/taylorgreen.c
index 9c090b593f..7a0e55f52c 100644
--- a/examples/fluids/problems/taylorgreen.c
+++ b/examples/fluids/problems/taylorgreen.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/pytorch_pkgconfig.py b/examples/fluids/pytorch_pkgconfig.py
new file mode 100644
index 0000000000..8cfdb96c32
--- /dev/null
+++ b/examples/fluids/pytorch_pkgconfig.py
@@ -0,0 +1,71 @@
+from pathlib import Path
+import torch
+import torch.utils.cpp_extension as C
+import torch.utils as tutils
+import re
+
+build_dir = Path('./build')
+if not build_dir.is_dir():
+    build_dir.mkdir()
+pkgconfig_path = build_dir / 'libtorch.pc'
+
+variables = {}
+keywords = {}
+
+
+def add_variable(file, variable, value):
+    file.write(f"{variable}={value}\n")
+
+
+def add_keyword(file, key, value):
+    file.write(f"{key}: {value}\n")
+
+
+variables['prefix'] = Path(C.library_paths()[0]).parent.as_posix()
+
+keywords['Name'] = 'libTorch'
+keywords['Description'] = 'Custom made PC for PyTorch'
+keywords['Version'] = torch.__version__
+
+keywords['Cflags'] = ''
+for include_path in C.include_paths():
+    keywords['Cflags'] += f'-I{include_path} '
+
+# Need to search the CMake file to see whether the library was compiled with the CXX11 ABI standard
+regex_ABI = re.compile(r'"(\S*GLIBCXX_USE_CXX11_ABI\S*)"')
+torchCMakePath = Path(tutils.cmake_prefix_path) / 'Torch/TorchConfig.cmake'
+abi_flag = ''
+with torchCMakePath.open('r') as f:
+    for line in f:
+        regex_result = regex_ABI.search(line)
+        if regex_result:
+            abi_flag = regex_result[1]
+
+keywords['Cflags'] += abi_flag
+
+keywords['Libs'] = ''
+for lib_path in C.library_paths():
+    keywords['Libs'] += f'-L{lib_path} '
+keywords['Libs'] += '-lc10 -ltorch_cpu '
+if torch.cuda.is_available():
+    keywords['Libs'] += '-lc10_cuda -ltorch_cuda '
+    # Need to force linking with libtorch_cuda.so, so find path and specify linking flag to force it
+    # This flag might be of limited portability
+    for lib_path in C.library_paths():
+        torch_cuda_path = Path(lib_path) / 'libtorch_cuda.so'
+        if torch_cuda_path.exists():
+            variables['torch_cuda_path'] = torch_cuda_path.as_posix()
+            keywords['Libs'] += f'-Wl,--no-as-needed,"{torch_cuda_path.as_posix()}" '
+keywords['Libs'] += '-ltorch '
+keywords['Libs.private'] = ''
+
+with pkgconfig_path.open('w') as file:
+    for variable, value in variables.items():
+        add_variable(file, variable, value)
+
+    file.write('\n')
+
+    for keyword, value in keywords.items():
+        add_keyword(file, keyword, value)
+
+print(pkgconfig_path.absolute())
diff --git a/examples/fluids/qfunctions/advection.h b/examples/fluids/qfunctions/advection.h
index 43b5293837..486e0727ed 100644
--- a/examples/fluids/qfunctions/advection.h
+++ b/examples/fluids/qfunctions/advection.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,11 @@
 
 /// @file
 /// Advection initial condition and operator for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#include <stdbool.h>
+#endif
 
 #include "advection_types.h"
 #include "newtonian_state.h"
diff --git a/examples/fluids/qfunctions/advection_types.h b/examples/fluids/qfunctions/advection_types.h
index 838995191c..ed008f0603 100644
--- a/examples/fluids/qfunctions/advection_types.h
+++ b/examples/fluids/qfunctions/advection_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -6,7 +6,11 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
+
 #include "stabilization_types.h"
 
 typedef enum {
diff --git a/examples/fluids/qfunctions/bc_freestream.h b/examples/fluids/qfunctions/bc_freestream.h
index 5fb4da2289..c348e9ab2e 100644
--- a/examples/fluids/qfunctions/bc_freestream.h
+++ b/examples/fluids/qfunctions/bc_freestream.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,6 +7,10 @@
 
 /// @file
 /// QFunctions for the `bc_freestream` and `bc_outflow` boundary conditions
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
+
 #include "bc_freestream_type.h"
 #include "newtonian_state.h"
 #include "newtonian_types.h"
@@ -30,17 +34,17 @@ CEED_QFUNCTION_HELPER int Freestream(void *ctx, CeedInt Q, const CeedScalar *con
     const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
     const State      s     = StateFromQ(newt_ctx, qi, state_var);
 
-    CeedScalar wdetJb, norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, norm);
+    CeedScalar wdetJb, normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, normal);
     wdetJb *= is_implicit ? -1. : 1.;
 
     StateConservative flux;
     switch (flux_type) {
       case RIEMANN_HLL:
-        flux = RiemannFlux_HLL(newt_ctx, s, context->S_infty, norm);
+        flux = RiemannFlux_HLL(newt_ctx, s, context->S_infty, normal);
         break;
       case RIEMANN_HLLC:
-        flux = RiemannFlux_HLLC(newt_ctx, s, context->S_infty, norm);
+        flux = RiemannFlux_HLLC(newt_ctx, s, context->S_infty, normal);
         break;
     }
     CeedScalar Flux[5];
@@ -64,6 +68,10 @@ CEED_QFUNCTION(Freestream_Prim_HLL)(void *ctx, CeedInt Q, const CeedScalar *cons
   return Freestream(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLL);
 }
 
+CEED_QFUNCTION(Freestream_Entropy_HLL)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Freestream(ctx, Q, in, out, STATEVAR_ENTROPY, RIEMANN_HLL);
+}
+
 CEED_QFUNCTION(Freestream_Conserv_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return Freestream(ctx, Q, in, out, STATEVAR_CONSERVATIVE, RIEMANN_HLLC);
 }
@@ -72,6 +80,10 @@ CEED_QFUNCTION(Freestream_Prim_HLLC)(void *ctx, CeedInt Q, const CeedScalar *con
   return Freestream(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLLC);
 }
 
+CEED_QFUNCTION(Freestream_Entropy_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Freestream(ctx, Q, in, out, STATEVAR_ENTROPY, RIEMANN_HLLC);
+}
+
 CEED_QFUNCTION_HELPER int Freestream_Jacobian(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateVariable state_var,
                                               RiemannFluxType flux_type) {
   const CeedScalar(*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
@@ -86,8 +98,8 @@ CEED_QFUNCTION_HELPER int Freestream_Jacobian(void *ctx, CeedInt Q, const CeedSc
   const State                    dS_infty    = {0};
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar wdetJb, norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, norm);
+    CeedScalar wdetJb, normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, NULL, normal);
     wdetJb *= is_implicit ? -1. : 1.;
 
     CeedScalar qi[5], dqi[5];
@@ -99,10 +111,10 @@ CEED_QFUNCTION_HELPER int Freestream_Jacobian(void *ctx, CeedInt Q, const CeedSc
     StateConservative dflux;
     switch (flux_type) {
       case RIEMANN_HLL:
-        dflux = RiemannFlux_HLL_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, norm);
+        dflux = RiemannFlux_HLL_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, normal);
         break;
       case RIEMANN_HLLC:
-        dflux = RiemannFlux_HLLC_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, norm);
+        dflux = RiemannFlux_HLLC_fwd(newt_ctx, s, ds, context->S_infty, dS_infty, normal);
         break;
     }
     CeedScalar dFlux[5];
@@ -120,6 +132,10 @@ CEED_QFUNCTION(Freestream_Jacobian_Prim_HLL)(void *ctx, CeedInt Q, const CeedSca
   return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLL);
 }
 
+CEED_QFUNCTION(Freestream_Jacobian_Entropy_HLL)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY, RIEMANN_HLL);
+}
+
 CEED_QFUNCTION(Freestream_Jacobian_Conserv_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_CONSERVATIVE, RIEMANN_HLLC);
 }
@@ -128,6 +144,10 @@ CEED_QFUNCTION(Freestream_Jacobian_Prim_HLLC)(void *ctx, CeedInt Q, const CeedSc
   return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE, RIEMANN_HLLC);
 }
 
+CEED_QFUNCTION(Freestream_Jacobian_Entropy_HLLC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Freestream_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY, RIEMANN_HLLC);
+}
+
 // Note the identity
 //
 // softplus(x) - x = log(1 + exp(x)) - x
@@ -166,8 +186,8 @@ CEED_QFUNCTION_HELPER int RiemannOutflow(void *ctx, CeedInt Q, const CeedScalar
   const bool                     is_implicit = gas->is_implicit;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar wdetJb, dXdx[2][3], norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm);
+    CeedScalar wdetJb, dXdx[2][3], normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal);
     wdetJb *= is_implicit ? -1. : 1.;
     const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
     const State      s_int = StateFromQ(gas, qi, state_var);
@@ -175,10 +195,10 @@ CEED_QFUNCTION_HELPER int RiemannOutflow(void *ctx, CeedInt Q, const CeedScalar
     StatePrimitive y_ext      = s_int.Y;
     y_ext.pressure            = outflow->pressure;
     y_ext.temperature         = outflow->temperature;
-    const CeedScalar u_normal = Dot3(y_ext.velocity, norm);
+    const CeedScalar u_normal = Dot3(y_ext.velocity, normal);
     const CeedScalar proj     = (1 - outflow->recirc) * Softplus(-u_normal, outflow->softplus_velocity);
     for (CeedInt j = 0; j < 3; j++) {
-      y_ext.velocity[j] += norm[j] * proj;  // (I - n n^T) projects into the plane tangent to the normal
+      y_ext.velocity[j] += normal[j] * proj;  // (I - n n^T) projects into the plane tangent to the normal
     }
     State s_ext = StateFromPrimitive(gas, y_ext);
 
@@ -191,10 +211,10 @@ CEED_QFUNCTION_HELPER int RiemannOutflow(void *ctx, CeedInt Q, const CeedScalar
     KMUnpack(kmstress, stress);
     ViscousEnergyFlux(gas, s_int.Y, grad_s, stress, Fe);
 
-    StateConservative F_inviscid_normal = RiemannFlux_HLLC(gas, s_int, s_ext, norm);
+    StateConservative F_inviscid_normal = RiemannFlux_HLLC(gas, s_int, s_ext, normal);
 
     CeedScalar Flux[5];
-    FluxTotal_RiemannBoundary(F_inviscid_normal, stress, Fe, norm, Flux);
+    FluxTotal_RiemannBoundary(F_inviscid_normal, stress, Fe, normal, Flux);
 
     for (CeedInt j = 0; j < 5; j++) v[j][i] = -wdetJb * Flux[j];
 
@@ -215,6 +235,10 @@ CEED_QFUNCTION(RiemannOutflow_Prim)(void *ctx, CeedInt Q, const CeedScalar *cons
   return RiemannOutflow(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(RiemannOutflow_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return RiemannOutflow(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // Jacobian for Riemann pressure/temperature outflow boundary condition
 // *****************************************************************************
@@ -231,8 +255,8 @@ CEED_QFUNCTION_HELPER int RiemannOutflow_Jacobian(void *ctx, CeedInt Q, const Ce
   const bool                     is_implicit = gas->is_implicit;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar wdetJb, dXdx[2][3], norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm);
+    CeedScalar wdetJb, dXdx[2][3], normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal);
     wdetJb *= is_implicit ? -1. : 1.;
 
     CeedScalar qi[5], kmstress[6], dqi[5];
@@ -247,13 +271,13 @@ CEED_QFUNCTION_HELPER int RiemannOutflow_Jacobian(void *ctx, CeedInt Q, const Ce
     y_ext.temperature          = outflow->temperature;
     dy_ext.pressure            = 0;
     dy_ext.temperature         = 0;
-    const CeedScalar u_normal  = Dot3(s_int.Y.velocity, norm);
-    const CeedScalar du_normal = Dot3(ds_int.Y.velocity, norm);
+    const CeedScalar u_normal  = Dot3(s_int.Y.velocity, normal);
+    const CeedScalar du_normal = Dot3(ds_int.Y.velocity, normal);
     const CeedScalar proj      = (1 - outflow->recirc) * Softplus(-u_normal, outflow->softplus_velocity);
     const CeedScalar dproj     = (1 - outflow->recirc) * Softplus_fwd(-u_normal, -du_normal, outflow->softplus_velocity);
     for (CeedInt j = 0; j < 3; j++) {
-      y_ext.velocity[j] += norm[j] * proj;
-      dy_ext.velocity[j] += norm[j] * dproj;
+      y_ext.velocity[j] += normal[j] * proj;
+      dy_ext.velocity[j] += normal[j] * dproj;
     }
 
     State s_ext  = StateFromPrimitive(gas, y_ext);
@@ -269,10 +293,10 @@ CEED_QFUNCTION_HELPER int RiemannOutflow_Jacobian(void *ctx, CeedInt Q, const Ce
     KMUnpack(kmstress, stress);
     ViscousEnergyFlux_fwd(gas, s_int.Y, ds_int.Y, grad_ds, stress, dstress, dFe);
 
-    StateConservative dF_inviscid_normal = RiemannFlux_HLLC_fwd(gas, s_int, ds_int, s_ext, ds_ext, norm);
+    StateConservative dF_inviscid_normal = RiemannFlux_HLLC_fwd(gas, s_int, ds_int, s_ext, ds_ext, normal);
 
     CeedScalar dFlux[5];
-    FluxTotal_RiemannBoundary(dF_inviscid_normal, dstress, dFe, norm, dFlux);
+    FluxTotal_RiemannBoundary(dF_inviscid_normal, dstress, dFe, normal, dFlux);
 
     for (int j = 0; j < 5; j++) v[j][i] = -wdetJb * dFlux[j];
   }
@@ -287,6 +311,10 @@ CEED_QFUNCTION(RiemannOutflow_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedSca
   return RiemannOutflow_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(RiemannOutflow_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return RiemannOutflow_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // Outflow boundary condition, weakly setting a constant pressure. This is the
 // classic outflow condition used by PHASTA-C and retained largely for
@@ -310,8 +338,8 @@ CEED_QFUNCTION_HELPER int PressureOutflow(void *ctx, CeedInt Q, const CeedScalar
     State            s     = StateFromQ(gas, qi, state_var);
     s.Y.pressure           = outflow->pressure;
 
-    CeedScalar wdetJb, dXdx[2][3], norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm);
+    CeedScalar wdetJb, dXdx[2][3], normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal);
     wdetJb *= is_implicit ? -1. : 1.;
 
     State grad_s[3];
@@ -327,7 +355,7 @@ CEED_QFUNCTION_HELPER int PressureOutflow(void *ctx, CeedInt Q, const CeedScalar
     FluxInviscid(gas, s, F_inviscid);
 
     CeedScalar Flux[5];
-    FluxTotal_Boundary(F_inviscid, stress, Fe, norm, Flux);
+    FluxTotal_Boundary(F_inviscid, stress, Fe, normal, Flux);
 
     for (CeedInt j = 0; j < 5; j++) v[j][i] = -wdetJb * Flux[j];
 
@@ -348,6 +376,10 @@ CEED_QFUNCTION(PressureOutflow_Prim)(void *ctx, CeedInt Q, const CeedScalar *con
   return PressureOutflow(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(PressureOutflow_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return PressureOutflow(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // Jacobian for weak-pressure outflow boundary condition
 // *****************************************************************************
@@ -364,8 +396,8 @@ CEED_QFUNCTION_HELPER int PressureOutflow_Jacobian(void *ctx, CeedInt Q, const C
   const bool                     is_implicit = gas->is_implicit;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar wdetJb, dXdx[2][3], norm[3];
-    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm);
+    CeedScalar wdetJb, dXdx[2][3], normal[3];
+    QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, normal);
     wdetJb *= is_implicit ? -1. : 1.;
 
     CeedScalar qi[5], kmstress[6], dqi[5];
@@ -392,7 +424,7 @@ CEED_QFUNCTION_HELPER int PressureOutflow_Jacobian(void *ctx, CeedInt Q, const C
     FluxInviscid_fwd(gas, s, ds, dF_inviscid);
 
     CeedScalar dFlux[5];
-    FluxTotal_Boundary(dF_inviscid, dstress, dFe, norm, dFlux);
+    FluxTotal_Boundary(dF_inviscid, dstress, dFe, normal, dFlux);
 
     for (int j = 0; j < 5; j++) v[j][i] = -wdetJb * dFlux[j];
   }
@@ -406,3 +438,7 @@ CEED_QFUNCTION(PressureOutflow_Jacobian_Conserv)(void *ctx, CeedInt Q, const Cee
 CEED_QFUNCTION(PressureOutflow_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return PressureOutflow_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
+
+CEED_QFUNCTION(PressureOutflow_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return PressureOutflow_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
diff --git a/examples/fluids/qfunctions/bc_freestream_type.h b/examples/fluids/qfunctions/bc_freestream_type.h
index 8c30ca2915..62a3fa1c4c 100644
--- a/examples/fluids/qfunctions/bc_freestream_type.h
+++ b/examples/fluids/qfunctions/bc_freestream_type.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/bc_slip.h b/examples/fluids/qfunctions/bc_slip.h
index 816d4957ca..5a77f3727e 100644
--- a/examples/fluids/qfunctions/bc_slip.h
+++ b/examples/fluids/qfunctions/bc_slip.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -56,6 +56,10 @@ CEED_QFUNCTION(Slip_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, Cee
   return Slip(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(Slip_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Slip(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 CEED_QFUNCTION_HELPER int Slip_Jacobian(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateVariable state_var) {
   const CeedScalar(*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
   const CeedScalar(*q_data_sur)     = in[2];
@@ -104,3 +108,7 @@ CEED_QFUNCTION(Slip_Jacobian_Conserv)(void *ctx, CeedInt Q, const CeedScalar *co
 CEED_QFUNCTION(Slip_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return Slip_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
+
+CEED_QFUNCTION(Slip_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return Slip_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
diff --git a/examples/fluids/qfunctions/blasius.h b/examples/fluids/qfunctions/blasius.h
index 52a7ff5614..20e4f4c72b 100644
--- a/examples/fluids/qfunctions/blasius.h
+++ b/examples/fluids/qfunctions/blasius.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,10 @@
 
 /// @file
 /// Operator for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
@@ -17,13 +20,11 @@
 
 typedef struct BlasiusContext_ *BlasiusContext;
 struct BlasiusContext_ {
-  bool                             implicit;                              // !< Using implicit timesteping or not
-  bool                             weakT;                                 // !< flag to set Temperature weakly at inflow
-  CeedScalar                       delta0;                                // !< Boundary layer height at inflow
-  CeedScalar                       U_inf;                                 // !< Velocity at boundary layer edge
-  CeedScalar                       T_inf;                                 // !< Temperature at boundary layer edge
+  bool                             implicit;  // !< Using implicit timesteping or not
+  bool                             weakT;     // !< flag to set Temperature weakly at inflow
+  CeedScalar                       delta0;    // !< Boundary layer height at inflow
+  State                            S_infty;
   CeedScalar                       T_wall;                                // !< Temperature at the wall
-  CeedScalar                       P0;                                    // !< Pressure at outflow
   CeedScalar                       x_inflow;                              // !< Location of inflow in x
   CeedScalar                       n_cheb;                                // !< Number of Chebyshev terms
   CeedScalar                      *X;                                     // !< Chebyshev polynomial coordinate vector (CPU only)
@@ -39,7 +40,7 @@ struct BlasiusContext_ {
 CEED_QFUNCTION_HELPER void ChebyshevEval(int N, const double *Tf, double x, double eta_max, double *f) {
   double dX_deta     = 2 / eta_max;
   double table[4][3] = {
-  // Chebyshev polynomials T_0, T_1, T_2 of the first kind in (-1,1)
+      // Chebyshev polynomials T_0, T_1, T_2 of the first kind in (-1,1)
       {1, x, 2 * x * x - 1},
       {0, 1, 4 * x        },
       {0, 0, 4            },
@@ -72,25 +73,26 @@ CEED_QFUNCTION_HELPER void ChebyshevEval(int N, const double *Tf, double x, doub
 // *****************************************************************************
 State CEED_QFUNCTION_HELPER(BlasiusSolution)(const BlasiusContext blasius, const CeedScalar x[3], const CeedScalar x0, const CeedScalar x_inflow,
                                              const CeedScalar rho_infty, CeedScalar *t12) {
-  CeedInt    N     = blasius->n_cheb;
-  CeedScalar mu    = blasius->newtonian_ctx.mu;
-  CeedScalar nu    = mu / rho_infty;
-  CeedScalar eta   = x[1] * sqrt(blasius->U_inf / (nu * (x0 + x[0] - x_inflow)));
-  CeedScalar X     = 2 * (eta / blasius->eta_max) - 1.;
-  CeedScalar U_inf = blasius->U_inf;
-  CeedScalar Rd    = GasConstant(&blasius->newtonian_ctx);
+  CeedInt    N       = blasius->n_cheb;
+  CeedScalar mu      = blasius->newtonian_ctx.mu;
+  State      S_infty = blasius->S_infty;
+  CeedScalar nu      = mu / rho_infty;
+  CeedScalar U_infty = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity));
+  CeedScalar eta     = x[1] * sqrt(U_infty / (nu * (x0 + x[0] - x_inflow)));
+  CeedScalar X       = 2 * (eta / blasius->eta_max) - 1.;
+  CeedScalar Rd      = GasConstant(&blasius->newtonian_ctx);
 
   CeedScalar f[4], h[4];
   ChebyshevEval(N, blasius->Tf_cheb, X, blasius->eta_max, f);
   ChebyshevEval(N - 1, blasius->Th_cheb, X, blasius->eta_max, h);
 
-  *t12 = mu * U_inf * f[2] * sqrt(U_inf / (nu * (x0 + x[0] - x_inflow)));
+  *t12 = mu * U_infty * f[2] * sqrt(U_infty / (nu * (x0 + x[0] - x_inflow)));
 
   CeedScalar Y[5];
-  Y[1] = U_inf * f[1];
-  Y[2] = 0.5 * sqrt(nu * U_inf / (x0 + x[0] - x_inflow)) * (eta * f[1] - f[0]);
+  Y[1] = U_infty * f[1];
+  Y[2] = 0.5 * sqrt(nu * U_infty / (x0 + x[0] - x_inflow)) * (eta * f[1] - f[0]);
   Y[3] = 0.;
-  Y[4] = blasius->T_inf * h[0];
+  Y[4] = S_infty.Y.temperature * h[0];
   Y[0] = rho_infty / h[0] * Rd * Y[4];
   return StateFromY(&blasius->newtonian_ctx, Y);
 }
@@ -109,24 +111,17 @@ CEED_QFUNCTION(ICsBlasius)(void *ctx, CeedInt Q, const CeedScalar *const *in, Ce
   const CeedScalar               x_inflow = context->x_inflow;
   CeedScalar                     t12;
 
-  const CeedScalar Y_inf[5] = {context->P0, context->U_inf, 0, 0, context->T_inf};
-  const State      s_inf    = StateFromY(gas, Y_inf);
+  const State      S_infty = context->S_infty;
+  const CeedScalar U_infty = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity));
 
-  const CeedScalar x0 = context->U_inf * s_inf.U.density / (mu * 25 / Square(delta0));
+  const CeedScalar x0 = U_infty * S_infty.U.density / (mu * 25 / Square(delta0));
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar x[3] = {X[0][i], X[1][i], X[2][i]};
-    State            s    = BlasiusSolution(context, x, x0, x_inflow, s_inf.U.density, &t12);
+    State            s    = BlasiusSolution(context, x, x0, x_inflow, S_infty.U.density, &t12);
     CeedScalar       q[5] = {0};
 
-    switch (gas->state_var) {
-      case STATEVAR_CONSERVATIVE:
-        UnpackState_U(s.U, q);
-        break;
-      case STATEVAR_PRIMITIVE:
-        UnpackState_Y(s.Y, q);
-        break;
-    }
+    StateToQ(gas, s, q, gas->state_var);
     for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j];
   }
   return 0;
@@ -143,8 +138,10 @@ CEED_QFUNCTION(Blasius_Inflow)(void *ctx, CeedInt Q, const CeedScalar *const *in
 
   const bool                     is_implicit = context->implicit;
   const NewtonianIdealGasContext gas         = &context->newtonian_ctx;
-  const CeedScalar               rho_0       = context->P0 / (GasConstant(gas) * context->T_inf);
-  const CeedScalar               x0          = context->U_inf * rho_0 / (gas->mu * 25 / Square(context->delta0));
+  State                          S_infty     = context->S_infty;
+  const CeedScalar               rho_0       = S_infty.U.density;
+  const CeedScalar               U_infty     = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity));
+  const CeedScalar               x0          = U_infty * rho_0 / (gas->mu * 25 / Square(context->delta0));
   const CeedScalar               zeros[11]   = {0.};
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
@@ -198,8 +195,10 @@ CEED_QFUNCTION(Blasius_Inflow_Jacobian)(void *ctx, CeedInt Q, const CeedScalar *
   const bool                     is_implicit = context->implicit;
   const CeedScalar               Rd          = GasConstant(gas);
   const CeedScalar               gamma       = HeatCapacityRatio(gas);
-  const CeedScalar               rho_0       = context->P0 / (Rd * context->T_inf);
-  const CeedScalar               x0          = context->U_inf * rho_0 / (gas->mu * 25 / (Square(context->delta0)));
+  const State                    S_infty     = context->S_infty;
+  const CeedScalar               rho_0       = S_infty.U.density;
+  const CeedScalar               U_infty     = sqrt(Dot3(S_infty.Y.velocity, S_infty.Y.velocity));
+  const CeedScalar               x0          = U_infty * rho_0 / (gas->mu * 25 / Square(context->delta0));
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     CeedScalar wdetJb, norm[3];
@@ -216,11 +215,12 @@ CEED_QFUNCTION(Blasius_Inflow_Jacobian)(void *ctx, CeedInt Q, const CeedScalar *
     if (context->weakT) {
       // rho should be from the current solution
       drho                   = dq[0][i];
-      CeedScalar dE_internal = drho * gas->cv * context->T_inf;
+      CeedScalar dE_internal = drho * gas->cv * S_infty.Y.temperature;
       CeedScalar dE_kinetic  = .5 * drho * Dot3(s.Y.velocity, s.Y.velocity);
       dE                     = dE_internal + dE_kinetic;
-      dP                     = drho * Rd * context->T_inf;  // interior rho with exterior T
-    } else {                                                // rho specified, E_internal from solution
+      dP                     = drho * Rd * S_infty.Y.temperature;  // interior rho with exterior T
+    } else {
+      // rho specified, E_internal from solution
       drho = 0;
       dE   = dq[4][i];
       dP   = dE * (gamma - 1.);
diff --git a/examples/fluids/qfunctions/channel.h b/examples/fluids/qfunctions/channel.h
index 7634696c74..9595c81701 100644
--- a/examples/fluids/qfunctions/channel.h
+++ b/examples/fluids/qfunctions/channel.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,11 @@
 
 /// @file
 /// Operator for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#include <stdbool.h>
+#endif
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
@@ -64,21 +67,14 @@ CEED_QFUNCTION(ICsChannel)(void *ctx, CeedInt Q, const CeedScalar *const *in, Ce
   const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
   CeedScalar(*q0)[CEED_Q_VLA]      = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
-  const ChannelContext context = (ChannelContext)ctx;
+  const ChannelContext           context = (ChannelContext)ctx;
+  const NewtonianIdealGasContext gas     = &context->newtonian_ctx;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar x[]  = {X[0][i], X[1][i], X[2][i]};
     State            s    = Exact_Channel(3, 0., x, 5, ctx);
     CeedScalar       q[5] = {0};
-    switch (context->newtonian_ctx.state_var) {
-      case STATEVAR_CONSERVATIVE:
-        UnpackState_U(s.U, q);
-        break;
-      case STATEVAR_PRIMITIVE:
-        UnpackState_Y(s.Y, q);
-        break;
-    }
-
+    StateToQ(gas, s, q, gas->state_var);
     for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j];
   }
   return 0;
diff --git a/examples/fluids/qfunctions/densitycurrent.h b/examples/fluids/qfunctions/densitycurrent.h
index 4d10261b4f..69fe9488fd 100644
--- a/examples/fluids/qfunctions/densitycurrent.h
+++ b/examples/fluids/qfunctions/densitycurrent.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -11,8 +11,10 @@
 // Model from:
 //   Semi-Implicit Formulations of the Navier-Stokes Equations: Application to
 //   Nonhydrostatic Atmospheric Modeling, Giraldo, Restelli, and Lauter (2010).
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
@@ -133,21 +135,14 @@ CEED_QFUNCTION(ICsDC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSca
   const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
   CeedScalar(*q0)[CEED_Q_VLA]      = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
-  const DensityCurrentContext context = (DensityCurrentContext)ctx;
+  const DensityCurrentContext    context = (DensityCurrentContext)ctx;
+  const NewtonianIdealGasContext gas     = &context->newtonian_ctx;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar x[]  = {X[0][i], X[1][i], X[2][i]};
     State            s    = Exact_DC(3, 0., x, 5, ctx);
     CeedScalar       q[5] = {0};
-    switch (context->newtonian_ctx.state_var) {
-      case STATEVAR_CONSERVATIVE:
-        UnpackState_U(s.U, q);
-        break;
-      case STATEVAR_PRIMITIVE:
-        UnpackState_Y(s.Y, q);
-        break;
-    }
-
+    StateToQ(gas, s, q, gas->state_var);
     for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j];
   }
   return 0;
diff --git a/examples/fluids/qfunctions/differential_filter.h b/examples/fluids/qfunctions/differential_filter.h
index 703bc2bfc8..a983cd7a63 100644
--- a/examples/fluids/qfunctions/differential_filter.h
+++ b/examples/fluids/qfunctions/differential_filter.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,10 @@
 //
 /// @file
 /// Implementation of differential filtering
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
 
 #include "differential_filter_enums.h"
 #include "newtonian_state.h"
@@ -64,6 +67,10 @@ CEED_QFUNCTION(DifferentialFilter_RHS_Prim)(void *ctx, CeedInt Q, const CeedScal
   return DifferentialFilter_RHS(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(DifferentialFilter_RHS_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return DifferentialFilter_RHS(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 CEED_QFUNCTION_HELPER CeedScalar VanDriestWallDamping(const CeedScalar wall_dist_plus, const CeedScalar A_plus) {
   return -expm1(-wall_dist_plus / A_plus);
 }
diff --git a/examples/fluids/qfunctions/differential_filter_enums.h b/examples/fluids/qfunctions/differential_filter_enums.h
index ffa548fff6..9c000c3b9d 100644
--- a/examples/fluids/qfunctions/differential_filter_enums.h
+++ b/examples/fluids/qfunctions/differential_filter_enums.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/eulervortex.h b/examples/fluids/qfunctions/eulervortex.h
index 308cb50cea..2f6c0ad003 100644
--- a/examples/fluids/qfunctions/eulervortex.h
+++ b/examples/fluids/qfunctions/eulervortex.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -11,8 +11,11 @@
 
 // Model from:
 //   On the Order of Accuracy and Numerical Performance of Two Classes of Finite Volume WENO Schemes, Zhang, Zhang, and Shu (2011).
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#include <stdbool.h>
+#endif
 
 #include "utils.h"
 
diff --git a/examples/fluids/qfunctions/gaussianwave.h b/examples/fluids/qfunctions/gaussianwave.h
index 4115d86a81..0bf6b612b4 100644
--- a/examples/fluids/qfunctions/gaussianwave.h
+++ b/examples/fluids/qfunctions/gaussianwave.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,10 @@
 
 /// @file
 /// Thermodynamic wave propogation for testing freestream/non-reflecting boundary conditions. Proposed in Mengaldo et. al. 2014
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "newtonian_state.h"
 #include "utils.h"
@@ -69,3 +71,7 @@ CEED_QFUNCTION(IC_GaussianWave_Conserv)(void *ctx, CeedInt Q, const CeedScalar *
 CEED_QFUNCTION(IC_GaussianWave_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return IC_GaussianWave(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
+
+CEED_QFUNCTION(IC_GaussianWave_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return IC_GaussianWave(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
diff --git a/examples/fluids/qfunctions/grid_anisotropy_tensor.h b/examples/fluids/qfunctions/grid_anisotropy_tensor.h
index ef59a54c6d..80078afcd4 100644
--- a/examples/fluids/qfunctions/grid_anisotropy_tensor.h
+++ b/examples/fluids/qfunctions/grid_anisotropy_tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,7 +8,7 @@
 /// @file
 /// Element anisotropy tensor, as defined in 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation'
 /// Prakash et al. 2022
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "utils.h"
 #include "utils_eigensolver_jacobi.h"
diff --git a/examples/fluids/qfunctions/inverse_multiplicity.h b/examples/fluids/qfunctions/inverse_multiplicity.h
index c51fc0586b..2c4a5ef335 100644
--- a/examples/fluids/qfunctions/inverse_multiplicity.h
+++ b/examples/fluids/qfunctions/inverse_multiplicity.h
@@ -1,10 +1,10 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
-#include <ceed.h>
+#include <ceed/types.h>
 
 // @brief Calculate the inverse of the multiplicity, reducing to a single component
 CEED_QFUNCTION(InverseMultiplicity)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/examples/fluids/qfunctions/mass.h b/examples/fluids/qfunctions/mass.h
index 1147a2bb31..81de13d16e 100644
--- a/examples/fluids/qfunctions/mass.h
+++ b/examples/fluids/qfunctions/mass.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,10 @@
 
 /// @file
 /// Mass operator for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // *****************************************************************************
 // This QFunction applies the mass matrix to five interlaced fields.
diff --git a/examples/fluids/qfunctions/newtonian.h b/examples/fluids/qfunctions/newtonian.h
index 1068519217..fa470e5cd1 100644
--- a/examples/fluids/qfunctions/newtonian.h
+++ b/examples/fluids/qfunctions/newtonian.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,9 +7,11 @@
 
 /// @file
 /// Operator for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
 #include <stdlib.h>
+#endif
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
@@ -43,11 +45,16 @@ CEED_QFUNCTION_HELPER int ICsNewtonianIG(void *ctx, CeedInt Q, const CeedScalar
   return 0;
 }
 
+CEED_QFUNCTION(ICsNewtonianIG_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
+}
+
 CEED_QFUNCTION(ICsNewtonianIG_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
-CEED_QFUNCTION(ICsNewtonianIG_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
+
+CEED_QFUNCTION(ICsNewtonianIG_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return ICsNewtonianIG(ctx, Q, in, out, STATEVAR_ENTROPY);
 }
 
 CEED_QFUNCTION_HELPER void MassFunction_Newtonian(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out,
@@ -143,15 +150,18 @@ CEED_QFUNCTION(RHSFunction_Newtonian)(void *ctx, CeedInt Q, const CeedScalar *co
   const CeedScalar(*q)[CEED_Q_VLA]   = (const CeedScalar(*)[CEED_Q_VLA])in[0];
   const CeedScalar(*Grad_q)          = in[1];
   const CeedScalar(*q_data)          = in[2];
+  const CeedScalar(*x)[CEED_Q_VLA]   = (const CeedScalar(*)[CEED_Q_VLA])in[3];
   CeedScalar(*v)[CEED_Q_VLA]         = (CeedScalar(*)[CEED_Q_VLA])out[0];
   CeedScalar(*Grad_v)[5][CEED_Q_VLA] = (CeedScalar(*)[5][CEED_Q_VLA])out[1];
 
   NewtonianIdealGasContext context = (NewtonianIdealGasContext)ctx;
   const CeedScalar        *g       = context->g;
   const CeedScalar         dt      = context->dt;
+  const CeedScalar         P0      = context->idl_pressure;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar U[5], wdetJ, dXdx[3][3];
+    CeedScalar       U[5], wdetJ, dXdx[3][3];
+    const CeedScalar x_i[3] = {x[0][i], x[1][i], x[2][i]};
     for (int j = 0; j < 5; j++) U[j] = q[j][i];
     QdataUnpack_3D(Q, i, q_data, &wdetJ, dXdx);
     State s = StateFromU(context, U);
@@ -179,6 +189,13 @@ CEED_QFUNCTION(RHSFunction_Newtonian)(void *ctx, CeedInt Q, const CeedScalar *co
     const CeedScalar body_force[5] = {0, s.U.density * g[0], s.U.density * g[1], s.U.density * g[2], Dot3(s.U.momentum, g)};
     for (int j = 0; j < 5; j++) v[j][i] = wdetJ * body_force[j];
 
+    if (context->idl_enable) {
+      const CeedScalar sigma         = LinearRampCoefficient(context->idl_amplitude, context->idl_length, context->idl_start, x_i[0]);
+      CeedScalar       damp_state[5] = {s.Y.pressure - P0, 0, 0, 0, 0}, idl_residual[5] = {0.};
+      InternalDampingLayer(context, s, sigma, damp_state, idl_residual);
+      for (int j = 0; j < 5; j++) v[j][i] -= wdetJ * idl_residual[j];
+    }
+
     // -- Stabilization method: none (Galerkin), SU, or SUPG
     CeedScalar Tau_d[3], stab[5][3], U_dot[5] = {0};
     Tau_diagPrim(context, s, dXdx, dt, Tau_d);
@@ -211,7 +228,7 @@ CEED_QFUNCTION_HELPER int IFunction_Newtonian(void *ctx, CeedInt Q, const CeedSc
   NewtonianIdealGasContext context = (NewtonianIdealGasContext)ctx;
   const CeedScalar        *g       = context->g;
   const CeedScalar         dt      = context->dt;
-  const CeedScalar         P0      = context->P0;
+  const CeedScalar         P0      = context->idl_pressure;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar qi[5]  = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
@@ -282,6 +299,10 @@ CEED_QFUNCTION(IFunction_Newtonian_Prim)(void *ctx, CeedInt Q, const CeedScalar
   return IFunction_Newtonian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(IFunction_Newtonian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return IFunction_Newtonian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // This QFunction implements the jacobian of the Navier-Stokes equations for implicit time stepping method.
 // *****************************************************************************
@@ -364,6 +385,10 @@ CEED_QFUNCTION(IJacobian_Newtonian_Prim)(void *ctx, CeedInt Q, const CeedScalar
   return IJacobian_Newtonian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(IJacobian_Newtonian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return IJacobian_Newtonian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // Compute boundary integral (ie. for strongly set inflows)
 // *****************************************************************************
@@ -418,6 +443,10 @@ CEED_QFUNCTION(BoundaryIntegral_Prim)(void *ctx, CeedInt Q, const CeedScalar *co
   return BoundaryIntegral(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(BoundaryIntegral_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return BoundaryIntegral(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // *****************************************************************************
 // Jacobian for "set nothing" boundary integral
 // *****************************************************************************
@@ -473,3 +502,7 @@ CEED_QFUNCTION(BoundaryIntegral_Jacobian_Conserv)(void *ctx, CeedInt Q, const Ce
 CEED_QFUNCTION(BoundaryIntegral_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return BoundaryIntegral_Jacobian(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
+
+CEED_QFUNCTION(BoundaryIntegral_Jacobian_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return BoundaryIntegral_Jacobian(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
diff --git a/examples/fluids/qfunctions/newtonian_state.h b/examples/fluids/qfunctions/newtonian_state.h
index 185caf06d6..0b6796f2fc 100644
--- a/examples/fluids/qfunctions/newtonian_state.h
+++ b/examples/fluids/qfunctions/newtonian_state.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -9,8 +9,10 @@
 /// Structs and helper functions regarding the state of a newtonian simulation
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "newtonian_types.h"
 #include "utils.h"
@@ -38,6 +40,12 @@ CEED_QFUNCTION_HELPER void UnpackState_Y(StatePrimitive s, CeedScalar Y[5]) {
   Y[4] = s.temperature;
 }
 
+CEED_QFUNCTION_HELPER void UnpackState_V(StateEntropy s, CeedScalar V[5]) {
+  V[0] = s.S_density;
+  for (int i = 0; i < 3; i++) V[i + 1] = s.S_momentum[i];
+  V[4] = s.S_energy;
+}
+
 CEED_QFUNCTION_HELPER CeedScalar HeatCapacityRatio(NewtonianIdealGasContext gas) { return gas->cp / gas->cv; }
 
 CEED_QFUNCTION_HELPER CeedScalar GasConstant(NewtonianIdealGasContext gas) { return gas->cp - gas->cv; }
@@ -49,14 +57,12 @@ CEED_QFUNCTION_HELPER CeedScalar SoundSpeed(NewtonianIdealGasContext gas, CeedSc
 CEED_QFUNCTION_HELPER CeedScalar Mach(NewtonianIdealGasContext gas, CeedScalar T, CeedScalar u) { return u / SoundSpeed(gas, T); }
 
 CEED_QFUNCTION_HELPER CeedScalar TotalSpecificEnthalpy(NewtonianIdealGasContext gas, const State s) {
-  // Ignoring potential energy
-  CeedScalar e_internal = gas->cv * s.Y.temperature;
   CeedScalar e_kinetic  = 0.5 * Dot3(s.Y.velocity, s.Y.velocity);
+  CeedScalar e_internal = gas->cv * s.Y.temperature;
   return e_internal + e_kinetic + s.Y.pressure / s.U.density;
 }
 
 CEED_QFUNCTION_HELPER CeedScalar TotalSpecificEnthalpy_fwd(NewtonianIdealGasContext gas, const State s, const State ds) {
-  // Ignoring potential energy
   CeedScalar de_kinetic  = Dot3(ds.Y.velocity, s.Y.velocity);
   CeedScalar de_internal = gas->cv * ds.Y.temperature;
   return de_internal + de_kinetic + ds.Y.pressure / s.U.density - s.Y.pressure / Square(s.U.density) * ds.U.density;
@@ -89,6 +95,63 @@ CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromConservative_fwd(Newtonia
   return dY;
 }
 
+CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) {
+  StateEntropy     V;
+  const CeedScalar gamma     = HeatCapacityRatio(gas);
+  const CeedScalar rho       = Y.pressure / (GasConstant(gas) * Y.temperature);
+  const CeedScalar entropy   = log(Y.pressure) - gamma * log(rho);
+  const CeedScalar rho_div_p = rho / Y.pressure;
+  const CeedScalar e_kinetic = 0.5 * Dot3(Y.velocity, Y.velocity);
+
+  V.S_density = (gamma - entropy) / (gamma - 1) - rho_div_p * e_kinetic;
+  for (int i = 0; i < 3; i++) V.S_momentum[i] = rho_div_p * Y.velocity[i];
+  V.S_energy = -rho_div_p;
+  return V;
+}
+
+CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromPrimitive_fwd(NewtonianIdealGasContext gas, State s, StatePrimitive dY) {
+  StateEntropy     dV;
+  const CeedScalar gamma = HeatCapacityRatio(gas);
+  CeedScalar       drho  = (dY.pressure * s.Y.temperature - s.Y.pressure * dY.temperature) / (GasConstant(gas) * s.Y.temperature * s.Y.temperature);
+
+  const CeedScalar e_kinetic  = .5 * Dot3(s.Y.velocity, s.Y.velocity);
+  const CeedScalar de_kinetic = Dot3(dY.velocity, s.Y.velocity);
+  const CeedScalar rho_div_p  = s.U.density / s.Y.pressure;
+  const CeedScalar drho_div_p = (drho * s.Y.pressure - s.U.density * dY.pressure) / Square(s.Y.pressure);
+
+  CeedScalar dentropy = dY.pressure / s.Y.pressure - gamma * drho / s.U.density;
+
+  dV.S_density = -dentropy / (gamma - 1) - de_kinetic * rho_div_p - e_kinetic * drho_div_p;
+  for (CeedInt i = 0; i < 3; i++) dV.S_momentum[i] = rho_div_p * dY.velocity[i] + drho_div_p * s.Y.velocity[i];
+  dV.S_energy = -drho_div_p;
+  return dV;
+}
+
+CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromEntropy(NewtonianIdealGasContext gas, StateEntropy V) {
+  StatePrimitive Y;
+  for (int i = 0; i < 3; i++) Y.velocity[i] = -V.S_momentum[i] / V.S_energy;
+  Y.temperature              = -1 / (GasConstant(gas) * V.S_energy);
+  const CeedScalar gamma     = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic = 0.5 * Dot3(Y.velocity, Y.velocity);
+  const CeedScalar entropy   = gamma - (gamma - 1) * (V.S_density - e_kinetic * V.S_energy);
+  const CeedScalar log_P     = -(entropy + gamma * log(-V.S_energy)) / (gamma - 1);
+  Y.pressure                 = exp(log_P);
+  return Y;
+}
+
+CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromEntropy_fwd(NewtonianIdealGasContext gas, State s, StateEntropy dV) {
+  StatePrimitive dY;
+  StateEntropy   V = StateEntropyFromPrimitive(gas, s.Y);
+  for (int i = 0; i < 3; i++) dY.velocity[i] = -(dV.S_momentum[i] - V.S_momentum[i] * dV.S_energy / V.S_energy) / V.S_energy;
+  dY.temperature              = dV.S_energy / (GasConstant(gas) * V.S_energy * V.S_energy);
+  const CeedScalar gamma      = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic  = 0.5 * Dot3(s.Y.velocity, s.Y.velocity);
+  const CeedScalar de_kinetic = Dot3(dY.velocity, s.Y.velocity);
+  const CeedScalar dentropy   = (1 - gamma) * (dV.S_density - e_kinetic * dV.S_energy - de_kinetic * V.S_energy);
+  dY.pressure                 = s.Y.pressure * (-dentropy - gamma * dV.S_energy / V.S_energy) / (gamma - 1);
+  return dY;
+}
+
 CEED_QFUNCTION_HELPER StateConservative StateConservativeFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) {
   StateConservative U;
   U.density = Y.pressure / (GasConstant(gas) * Y.temperature);
@@ -116,6 +179,77 @@ CEED_QFUNCTION_HELPER StateConservative StateConservativeFromPrimitive_fwd(Newto
   return dU;
 }
 
+CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromConservative(NewtonianIdealGasContext gas, StateConservative U) {
+  StateEntropy     V;
+  const CeedScalar gamma      = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic  = .5 * Dot3(U.momentum, U.momentum) / U.density;
+  const CeedScalar e_internal = U.E_total - e_kinetic;
+  const CeedScalar p          = (gamma - 1) * e_internal;
+  const CeedScalar entropy    = log(p) - gamma * log(U.density);
+
+  V.S_density = (gamma - entropy) / (gamma - 1) - e_kinetic / p;
+  for (int i = 0; i < 3; i++) V.S_momentum[i] = U.momentum[i] / p;
+  V.S_energy = -U.density / p;
+  return V;
+}
+
+CEED_QFUNCTION_HELPER StateEntropy StateEntropyFromConservative_fwd(NewtonianIdealGasContext gas, State s, StateConservative dU) {
+  StateEntropy     dV;
+  const CeedScalar gamma       = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic   = .5 * Dot3(s.U.momentum, s.U.momentum) / s.U.density;
+  const CeedScalar de_kinetic  = (Dot3(s.U.momentum, dU.momentum) - e_kinetic * dU.density) / s.U.density;
+  const CeedScalar de_internal = dU.E_total - de_kinetic;
+  const CeedScalar p           = s.Y.pressure;
+  const CeedScalar dp          = (gamma - 1) * de_internal;
+
+  CeedScalar dentropy = dp / p - gamma * dU.density / s.U.density;
+
+  dV.S_density = -dentropy / (gamma - 1) - de_kinetic / p + dp * e_kinetic / Square(p);
+  for (CeedInt i = 0; i < 3; i++) {
+    dV.S_momentum[i] = (dU.momentum[i] - s.U.momentum[i] * dp / p) / p;
+  }
+  dV.S_energy = -(dU.density - s.U.density * dp / p) / p;
+  return dV;
+}
+
+CEED_QFUNCTION_HELPER StateConservative StateConservativeFromEntropy(NewtonianIdealGasContext gas, StateEntropy V) {
+  StateConservative U;
+  CeedScalar        velocity[3];
+  for (int i = 0; i < 3; i++) velocity[i] = -V.S_momentum[i] / V.S_energy;
+  const CeedScalar gamma     = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic = 0.5 * Dot3(velocity, velocity);
+  const CeedScalar entropy   = gamma - (gamma - 1) * (V.S_density - e_kinetic * V.S_energy);
+  const CeedScalar log_rho   = -(entropy + log(-V.S_energy)) / (gamma - 1);
+  U.density                  = exp(log_rho);
+  for (int i = 0; i < 3; i++) U.momentum[i] = U.density * velocity[i];
+
+  const CeedScalar e_internal = -gas->cv / (GasConstant(gas) * V.S_energy);
+  U.E_total                   = U.density * (e_internal + e_kinetic);
+  return U;
+}
+
+CEED_QFUNCTION_HELPER StateConservative StateConservativeFromEntropy_fwd(NewtonianIdealGasContext gas, State s, StateEntropy dV) {
+  StateConservative dU;
+  CeedScalar        dvelocity[3];
+  StateEntropy      V = StateEntropyFromPrimitive(gas, s.Y);
+  for (int i = 0; i < 3; i++) dvelocity[i] = (-dV.S_momentum[i] - s.Y.velocity[i] * dV.S_energy) / V.S_energy;
+  const CeedScalar gamma      = HeatCapacityRatio(gas);
+  const CeedScalar e_kinetic  = 0.5 * Dot3(s.Y.velocity, s.Y.velocity);
+  const CeedScalar de_kinetic = Dot3(dvelocity, s.Y.velocity);
+  const CeedScalar entropy    = gamma - (gamma - 1) * (V.S_density - e_kinetic * V.S_energy);
+  const CeedScalar dentropy   = -(gamma - 1) * (dV.S_density - (de_kinetic * V.S_energy + e_kinetic * dV.S_energy));
+  const CeedScalar log_rho    = -(entropy + log(-V.S_energy)) / (gamma - 1);
+  const CeedScalar rho        = exp(log_rho);
+  dU.density                  = -rho / (gamma - 1) * (dentropy + dV.S_energy / V.S_energy);
+  for (int i = 0; i < 3; i++) dU.momentum[i] = dU.density * s.Y.velocity[i] + s.U.density * dvelocity[i];
+
+  const CeedScalar e_internal  = -gas->cv / (GasConstant(gas) * V.S_energy);
+  const CeedScalar de_internal = gas->cv * dV.S_energy / (GasConstant(gas) * V.S_energy * V.S_energy);
+  const CeedScalar e_total     = e_internal + e_kinetic;
+  dU.E_total                   = dU.density * e_total + s.U.density * (de_internal + de_kinetic);
+  return dU;
+}
+
 CEED_QFUNCTION_HELPER State StateFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y) {
   StateConservative U = StateConservativeFromPrimitive(gas, Y);
   State             s;
@@ -156,6 +290,11 @@ CEED_QFUNCTION_HELPER void StateToU(NewtonianIdealGasContext gas, const State in
 
 CEED_QFUNCTION_HELPER void StateToY(NewtonianIdealGasContext gas, const State input, CeedScalar Y[5]) { UnpackState_Y(input.Y, Y); }
 
+CEED_QFUNCTION_HELPER void StateToV(NewtonianIdealGasContext gas, const State input, CeedScalar V[5]) {
+  StateEntropy state_V = StateEntropyFromPrimitive(gas, input.Y);
+  UnpackState_V(state_V, V);
+}
+
 CEED_QFUNCTION_HELPER void StateToQ(NewtonianIdealGasContext gas, const State input, CeedScalar Q[5], StateVariable state_var) {
   switch (state_var) {
     case STATEVAR_CONSERVATIVE:
@@ -164,6 +303,25 @@ CEED_QFUNCTION_HELPER void StateToQ(NewtonianIdealGasContext gas, const State in
     case STATEVAR_PRIMITIVE:
       StateToY(gas, input, Q);
       break;
+    case STATEVAR_ENTROPY:
+      StateToV(gas, input, Q);
+      break;
+  }
+}
+
+CEED_QFUNCTION_HELPER void StateToQ_fwd(NewtonianIdealGasContext gas, const State input, const State dinput, CeedScalar dQ[5],
+                                        StateVariable state_var) {
+  switch (state_var) {
+    case STATEVAR_CONSERVATIVE:
+    case STATEVAR_PRIMITIVE:
+      StateToQ(gas, dinput, dQ, state_var);
+      break;
+    case STATEVAR_ENTROPY: {
+      StateEntropy dstate_v;
+
+      dstate_v = StateEntropyFromPrimitive_fwd(gas, input, dinput.Y);
+      UnpackState_V(dstate_v, dQ);
+    } break;
   }
 }
 
@@ -211,6 +369,32 @@ CEED_QFUNCTION_HELPER State StateFromY_fwd(NewtonianIdealGasContext gas, State s
   return ds;
 }
 
+CEED_QFUNCTION_HELPER State StateFromV(NewtonianIdealGasContext gas, const CeedScalar V[5]) {
+  State        s;
+  StateEntropy state_V;
+  state_V.S_density     = V[0];
+  state_V.S_momentum[0] = V[1];
+  state_V.S_momentum[1] = V[2];
+  state_V.S_momentum[2] = V[3];
+  state_V.S_energy      = V[4];
+  s.U                   = StateConservativeFromEntropy(gas, state_V);
+  s.Y                   = StatePrimitiveFromEntropy(gas, state_V);
+  return s;
+}
+
+CEED_QFUNCTION_HELPER State StateFromV_fwd(NewtonianIdealGasContext gas, State s, const CeedScalar dV[5]) {
+  State        ds;
+  StateEntropy state_dV;
+  state_dV.S_density     = dV[0];
+  state_dV.S_momentum[0] = dV[1];
+  state_dV.S_momentum[1] = dV[2];
+  state_dV.S_momentum[2] = dV[3];
+  state_dV.S_energy      = dV[4];
+  ds.U                   = StateConservativeFromEntropy_fwd(gas, s, state_dV);
+  ds.Y                   = StatePrimitiveFromEntropy_fwd(gas, s, state_dV);
+  return ds;
+}
+
 CEED_QFUNCTION_HELPER State StateFromQ(NewtonianIdealGasContext gas, const CeedScalar Q[5], StateVariable state_var) {
   State s;
   switch (state_var) {
@@ -220,6 +404,9 @@ CEED_QFUNCTION_HELPER State StateFromQ(NewtonianIdealGasContext gas, const CeedS
     case STATEVAR_PRIMITIVE:
       s = StateFromY(gas, Q);
       break;
+    case STATEVAR_ENTROPY:
+      s = StateFromV(gas, Q);
+      break;
   }
   return s;
 }
@@ -233,6 +420,9 @@ CEED_QFUNCTION_HELPER State StateFromQ_fwd(NewtonianIdealGasContext gas, State s
     case STATEVAR_PRIMITIVE:
       ds = StateFromY_fwd(gas, s, dQ);
       break;
+    case STATEVAR_ENTROPY:
+      ds = StateFromV_fwd(gas, s, dQ);
+      break;
   }
   return ds;
 }
diff --git a/examples/fluids/qfunctions/newtonian_types.h b/examples/fluids/qfunctions/newtonian_types.h
index 3a5402c36d..70b2b4c3bd 100644
--- a/examples/fluids/qfunctions/newtonian_types.h
+++ b/examples/fluids/qfunctions/newtonian_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -6,13 +6,17 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
 
 #include "stabilization_types.h"
 
 typedef enum {
   STATEVAR_CONSERVATIVE = 0,
   STATEVAR_PRIMITIVE    = 1,
+  STATEVAR_ENTROPY      = 2,
 } StateVariable;
 
 typedef struct NewtonianIdealGasContext_ *NewtonianIdealGasContext;
@@ -32,11 +36,11 @@ struct NewtonianIdealGasContext_ {
   CeedScalar        dt;
   CeedScalar        time;
   CeedScalar        ijacobian_time_shift;
-  CeedScalar        P0;
   bool              is_implicit;
   StateVariable     state_var;
   StabilizationType stabilization;
   bool              idl_enable;
+  CeedScalar        idl_pressure;
   CeedScalar        idl_amplitude;
   CeedScalar        idl_start;
   CeedScalar        idl_length;
@@ -48,6 +52,12 @@ typedef struct {
   CeedScalar temperature;
 } StatePrimitive;
 
+typedef struct {
+  CeedScalar S_density;
+  CeedScalar S_momentum[3];
+  CeedScalar S_energy;
+} StateEntropy;
+
 typedef struct SetupContext_ *SetupContext;
 struct SetupContext_ {
   StatePrimitive                   reference;
diff --git a/examples/fluids/qfunctions/riemann_solver.h b/examples/fluids/qfunctions/riemann_solver.h
index 8ab0570504..7d884e9ad1 100644
--- a/examples/fluids/qfunctions/riemann_solver.h
+++ b/examples/fluids/qfunctions/riemann_solver.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -84,13 +84,11 @@ CEED_QFUNCTION_HELPER StateConservative Flux_HLL_fwd(State left, State right, St
   UnpackState_U(dflux_left, dF_l);
   UnpackState_U(dflux_right, dF_r);
   for (int i = 0; i < 5; i++) {
-    const CeedScalar U_diff      = U_r[i] - U_l[i];
-    const CeedScalar S_diff      = S_r - S_l;
-    const CeedScalar F_hll_denom = S_r * F_l[i] - S_l * F_r[i] + S_l * S_r * U_diff;
+    const CeedScalar S_diff = S_r - S_l;
 
-    dF_hll[i] += ((F_l[i] + S_r * U_diff) * S_diff - F_hll_denom) / Square(S_diff) * dS_r;
-    dF_hll[i] += ((-F_r[i] + S_r * U_diff) * S_diff + F_hll_denom) / Square(S_diff) * dS_l;
-    dF_hll[i] += (S_r * dF_l[i] - S_l * dF_r[i] + S_r * S_l * dU_r[i] - S_r * S_l * dU_l[i]) / S_diff;
+    dF_hll[i] += (S_l * (-F_l[i] + F_r[i] + S_l * U_l[i] - S_l * U_r[i]) / Square(S_diff)) * dS_r;
+    dF_hll[i] += (S_r * (F_l[i] - F_r[i] - S_r * U_l[i] + S_r * U_r[i]) / Square(S_diff)) * dS_l;
+    dF_hll[i] += (S_r * dF_l[i] - S_l * dF_r[i] + S_r * S_l * (dU_r[i] - dU_l[i])) / S_diff;
   }
   StateConservative dF = {
       dF_hll[0],
@@ -110,7 +108,6 @@ CEED_QFUNCTION_HELPER void ComputeHLLSpeeds_Roe(NewtonianIdealGasContext gas, St
   // Stability requires that these speed estimates are *at least* as fast as the physical wave speeds.
   CeedScalar u_roe = RoeAverage(r, u_left, u_right);
 
-  // TODO: revisit this for gravity
   CeedScalar H_left  = TotalSpecificEnthalpy(gas, left);
   CeedScalar H_right = TotalSpecificEnthalpy(gas, right);
   CeedScalar H_roe   = RoeAverage(r, H_left, H_right);
@@ -142,7 +139,8 @@ CEED_QFUNCTION_HELPER void ComputeHLLSpeeds_Roe_fwd(NewtonianIdealGasContext gas
   CeedScalar H_roe  = RoeAverage(r, H_left, H_right);
   CeedScalar dH_roe = RoeAverage_fwd(r, dr, H_left, H_right, dH_left, dH_right);
   CeedScalar a_roe  = sqrt((gamma - 1) * (H_roe - 0.5 * Square(u_roe)));
-  CeedScalar da_roe = 0.5 * (gamma - 1) / sqrt(H_roe) * dH_roe - 0.5 * sqrt(gamma - 1) * u_roe / sqrt(H_roe - 0.5 * Square(u_roe)) * du_roe;
+  CeedScalar da_roe = 0.5 * sqrt((gamma - 1) / (H_roe - 0.5 * Square(u_roe))) * dH_roe;  // (da/dH) dH
+  da_roe -= 0.5 * sqrt(gamma - 1) * u_roe / sqrt(H_roe - 0.5 * Square(u_roe)) * du_roe;  // (da/du) du
 
   *s_left   = u_roe - a_roe;
   *ds_left  = du_roe - da_roe;
diff --git a/examples/fluids/qfunctions/setupgeo.h b/examples/fluids/qfunctions/setupgeo.h
index a4d5181ad7..62b8390376 100644
--- a/examples/fluids/qfunctions/setupgeo.h
+++ b/examples/fluids/qfunctions/setupgeo.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,10 @@
 
 /// @file
 /// Geometric factors (3D) for Navier-Stokes example using PETSc
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "setupgeo_helpers.h"
 #include "utils.h"
diff --git a/examples/fluids/qfunctions/setupgeo2d.h b/examples/fluids/qfunctions/setupgeo2d.h
index c01753b2c1..0cd5649296 100644
--- a/examples/fluids/qfunctions/setupgeo2d.h
+++ b/examples/fluids/qfunctions/setupgeo2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,8 @@
 
 /// @file
 /// Geometric factors (2D) for Navier-Stokes example using PETSc
-#include <ceed.h>
-#include <math.h>
+#include <ceed/types.h>
+
 #include "setupgeo_helpers.h"
 #include "utils.h"
 
@@ -98,3 +98,57 @@ CEED_QFUNCTION(SetupBoundary2d)(void *ctx, CeedInt Q, const CeedScalar *const *i
   }
   return 0;
 }
+
+// *****************************************************************************
+// This QFunction sets up the geometric factor required for integration when reference coordinates are in 2D and the physical coordinates are in 3D
+//
+// Reference (parent) 2D coordinates: X
+// Physical (current) 3D coordinates: x
+// Change of coordinate matrix:
+//   dxdX_{i,j} = dx_i/dX_j (indicial notation) [3 * 2]
+// Inverse change of coordinate matrix:
+//   dXdx_{i,j} = dX_i/dx_j (indicial notation) [2 * 3]
+//
+// (J1,J2,J3) is given by the cross product of the columns of dxdX_{i,j}
+//
+// detJb is the magnitude of (J1,J2,J3)
+//
+// dXdx is calculated via Moore–Penrose inverse:
+//
+//   dX_i/dx_j = (dxdX^T dxdX)^(-1) dxdX
+//             = (dx_l/dX_i * dx_l/dX_k)^(-1) dx_j/dX_k
+//
+// All quadrature data is stored in 10 field vector of quadrature data.
+//
+// We require the determinant of the Jacobian to properly compute integrals of
+//   the form: int( u v )
+//
+// Stored: w detJb
+//   in q_data_sur[0]
+//
+// Normal vector = (J1,J2,J3) / detJb
+//
+// Stored: (J1,J2,J3) / detJb
+//
+// Stored: dXdx_{i,j}
+//   in q_data_sur[1:6] as
+//    [dXdx_11 dXdx_12 dXdx_13]
+//    [dXdx_21 dXdx_22 dXdx_23]
+// *****************************************************************************
+CEED_QFUNCTION(Setup2D_3Dcoords)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+  const CeedScalar(*w)                = in[1];
+  CeedScalar(*q_data_sur)             = out[0];
+
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    CeedScalar detJb, normal[3], dXdx[2][3];
+
+    NormalVectorFromdxdX_3D(Q, i, J, normal, &detJb);
+    InvertBoundaryMappingJacobian_3D(Q, i, J, dXdx);
+    const CeedScalar wdetJ = w[i] * detJb;
+
+    StoredValuesPack(Q, i, 0, 1, &wdetJ, q_data_sur);
+    StoredValuesPack(Q, i, 1, 6, (const CeedScalar *)dXdx, q_data_sur);
+  }
+  return 0;
+}
diff --git a/examples/fluids/qfunctions/setupgeo_helpers.h b/examples/fluids/qfunctions/setupgeo_helpers.h
index 930ff7bb72..6677225f4f 100644
--- a/examples/fluids/qfunctions/setupgeo_helpers.h
+++ b/examples/fluids/qfunctions/setupgeo_helpers.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -9,8 +9,10 @@
 /// Geometric factors (3D) for Navier-Stokes example using PETSc
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "utils.h"
 
diff --git a/examples/fluids/qfunctions/sgs_dd_model.h b/examples/fluids/qfunctions/sgs_dd_model.h
deleted file mode 100644
index da1a8f7967..0000000000
--- a/examples/fluids/qfunctions/sgs_dd_model.h
+++ /dev/null
@@ -1,245 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Structs and helper functions to evaluate data-driven subgrid-stress modeling
-/// See 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation' 2022 and 'S-frame discrepancy
-/// correction models for data-informed Reynolds stress closure' 2022
-#include <ceed.h>
-
-#include "newtonian_state.h"
-#include "newtonian_types.h"
-#include "sgs_dd_utils.h"
-#include "utils.h"
-#include "utils_eigensolver_jacobi.h"
-
-typedef struct SgsDDContext_ *SgsDDContext;
-struct SgsDDContext_ {
-  CeedInt    num_inputs, num_outputs;
-  CeedInt    num_layers;
-  CeedInt    num_neurons;
-  CeedScalar alpha;
-
-  struct NewtonianIdealGasContext_ gas;
-  struct {
-    size_t bias1, bias2;
-    size_t weight1, weight2;
-    size_t out_scaling;
-  } offsets;
-  size_t     total_bytes;
-  CeedScalar data[1];
-};
-
-CEED_QFUNCTION_HELPER void LeakyReLU(CeedScalar *x, const CeedScalar alpha, const CeedInt N) {
-  for (CeedInt i = 0; i < N; i++) x[i] *= (x[i] < 0 ? alpha : 1.);
-}
-
-CEED_QFUNCTION_HELPER void DataDrivenInference(const CeedScalar *inputs, CeedScalar *outputs, SgsDDContext sgsdd_ctx) {
-  const CeedInt     num_neurons = sgsdd_ctx->num_neurons;
-  const CeedInt     num_inputs  = sgsdd_ctx->num_inputs;
-  const CeedInt     num_outputs = sgsdd_ctx->num_outputs;
-  const CeedScalar  alpha       = sgsdd_ctx->alpha;
-  const CeedScalar *bias1       = &sgsdd_ctx->data[sgsdd_ctx->offsets.bias1];
-  const CeedScalar *bias2       = &sgsdd_ctx->data[sgsdd_ctx->offsets.bias2];
-  const CeedScalar *weight1     = &sgsdd_ctx->data[sgsdd_ctx->offsets.weight1];
-  const CeedScalar *weight2     = &sgsdd_ctx->data[sgsdd_ctx->offsets.weight2];
-  CeedScalar        V[20]       = {0.};
-
-  CopyN(bias1, V, num_neurons);
-  MatVecNM(weight1, inputs, num_neurons, num_inputs, CEED_NOTRANSPOSE, V);
-  LeakyReLU(V, alpha, num_neurons);
-  CopyN(bias2, outputs, num_outputs);
-  MatVecNM(weight2, V, num_outputs, num_neurons, CEED_NOTRANSPOSE, outputs);
-}
-
-CEED_QFUNCTION_HELPER void ComputeSgsDD_Fused(const CeedScalar grad_velo_aniso[3][3], const CeedScalar km_A_ij[6], const CeedScalar delta,
-                                              const CeedScalar viscosity, CeedScalar kmsgs_stress[6], SgsDDContext sgsdd_ctx) {
-  CeedScalar inputs[6], grad_velo_magnitude, eigenvectors[3][3], sgs_sframe_sym[6] = {0.}, new_bounds[6][2];
-  // Copying new_bounds because Sycl online compiler doesn't like direct casting the pointer
-  CopyN(&sgsdd_ctx->data[sgsdd_ctx->offsets.out_scaling], (CeedScalar *)new_bounds, 12);
-
-  ComputeSgsDDInputs(grad_velo_aniso, km_A_ij, delta, viscosity, eigenvectors, inputs, &grad_velo_magnitude);
-  DataDrivenInference(inputs, sgs_sframe_sym, sgsdd_ctx);
-  ComputeSgsDDOutputs(sgs_sframe_sym, delta, eigenvectors, new_bounds, grad_velo_magnitude, kmsgs_stress);
-}
-
-// @brief Calculate subgrid stress at nodes using anisotropic data-driven model
-CEED_QFUNCTION_HELPER int ComputeSgsDDNodal_Fused(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out,
-                                                  StateVariable state_var) {
-  const CeedScalar(*q)[CEED_Q_VLA]            = (const CeedScalar(*)[CEED_Q_VLA])in[0];
-  const CeedScalar(*grad_velo)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-  const CeedScalar(*A_ij_delta)[CEED_Q_VLA]   = (const CeedScalar(*)[CEED_Q_VLA])in[3];
-  const CeedScalar(*inv_multiplicity)         = (const CeedScalar(*))in[4];
-  CeedScalar(*v)[CEED_Q_VLA]                  = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  const SgsDDContext             sgsdd_ctx = (SgsDDContext)ctx;
-  const NewtonianIdealGasContext gas       = &sgsdd_ctx->gas;
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    const CeedScalar qi[5]                 = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
-    const CeedScalar grad_velo_aniso[3][3] = {
-        {grad_velo[0][0][i], grad_velo[0][1][i], grad_velo[0][2][i]},
-        {grad_velo[1][0][i], grad_velo[1][1][i], grad_velo[1][2][i]},
-        {grad_velo[2][0][i], grad_velo[2][1][i], grad_velo[2][2][i]}
-    };
-    const CeedScalar km_A_ij[6] = {A_ij_delta[0][i], A_ij_delta[1][i], A_ij_delta[2][i], A_ij_delta[3][i], A_ij_delta[4][i], A_ij_delta[5][i]};
-    const CeedScalar delta      = A_ij_delta[6][i];
-    const State      s          = StateFromQ(gas, qi, state_var);
-    CeedScalar       km_sgs[6];
-
-    ComputeSgsDD_Fused(grad_velo_aniso, km_A_ij, delta, gas->mu / s.U.density, km_sgs, sgsdd_ctx);
-
-    for (int j = 0; j < 6; j++) v[j][i] = inv_multiplicity[i] * km_sgs[j];
-  }
-  return 0;
-}
-
-CEED_QFUNCTION(ComputeSgsDDNodal_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSgsDDNodal_Fused(ctx, Q, in, out, STATEVAR_PRIMITIVE);
-}
-
-CEED_QFUNCTION(ComputeSgsDDNodal_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSgsDDNodal_Fused(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
-}
-
-// @brief Calculate inputs to anisotropic data-driven model
-CEED_QFUNCTION_HELPER int ComputeSgsDDNodal_Sequential_Inputs(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out,
-                                                              StateVariable state_var) {
-  const CeedScalar(*q)[CEED_Q_VLA]            = (const CeedScalar(*)[CEED_Q_VLA])in[0];
-  const CeedScalar(*grad_velo)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1];
-  const CeedScalar(*A_ij_delta)[CEED_Q_VLA]   = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-  const CeedScalar(*inv_multiplicity)         = (const CeedScalar(*))in[3];
-  CeedScalar(*eigenvectors_stored)            = out[0];
-  CeedScalar(*model_inputs)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[1];
-
-  const SgsDDContext             sgsdd_ctx = (SgsDDContext)ctx;
-  const NewtonianIdealGasContext gas       = &sgsdd_ctx->gas;
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    const CeedScalar qi[5]                 = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
-    const CeedScalar grad_velo_aniso[3][3] = {
-        {grad_velo[0][0][i], grad_velo[0][1][i], grad_velo[0][2][i]},
-        {grad_velo[1][0][i], grad_velo[1][1][i], grad_velo[1][2][i]},
-        {grad_velo[2][0][i], grad_velo[2][1][i], grad_velo[2][2][i]}
-    };
-    const CeedScalar km_A_ij[6] = {A_ij_delta[0][i], A_ij_delta[1][i], A_ij_delta[2][i], A_ij_delta[3][i], A_ij_delta[4][i], A_ij_delta[5][i]};
-    const CeedScalar delta      = A_ij_delta[6][i];
-    const State      s          = StateFromQ(gas, qi, state_var);
-
-    CeedScalar model_inputs_i[6], grad_velo_magnitude, eigenvectors[3][3];
-    ComputeSgsDDInputs(grad_velo_aniso, km_A_ij, delta, gas->mu / s.U.density, eigenvectors, model_inputs_i, &grad_velo_magnitude);
-
-    ScaleN(model_inputs_i, inv_multiplicity[i], 6);
-    StoredValuesPack(Q, i, 0, 6, model_inputs_i, (CeedScalar *)model_inputs);
-    StoredValuesPack(Q, i, 0, 9, (const CeedScalar *)eigenvectors, eigenvectors_stored);
-    StoredValuesPack(Q, i, 9, 1, &grad_velo_magnitude, eigenvectors_stored);
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inputs_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSgsDDNodal_Sequential_Inputs(ctx, Q, in, out, STATEVAR_PRIMITIVE);
-}
-
-CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inputs_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSgsDDNodal_Sequential_Inputs(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
-}
-
-// @brief Runs inference on the data-driven model, used predominantsly for testing and validation
-CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Inference)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  const CeedScalar(*model_inputs)     = in[0];
-  const CeedScalar(*inv_multiplicity) = in[1];
-  CeedScalar(*model_outputs)          = out[0];
-
-  const SgsDDContext sgsdd_ctx = (SgsDDContext)ctx;
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar model_inputs_i[6], model_outputs_i[6];
-
-    StoredValuesUnpack(Q, i, 0, 6, (const CeedScalar *)model_inputs, model_inputs_i);
-    DataDrivenInference(model_inputs_i, model_outputs_i, sgsdd_ctx);
-    ScaleN(model_outputs_i, inv_multiplicity[i], 6);
-    StoredValuesPack(Q, i, 0, 6, model_outputs_i, model_outputs);
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-// @brief Calculates SGS from outputs of anisotropic data-driven model
-CEED_QFUNCTION(ComputeSgsDDNodal_Sequential_Outputs)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  const CeedScalar(*model_outputs)          = in[0];
-  const CeedScalar(*A_ij_delta)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  const CeedScalar(*inv_multiplicity)       = (const CeedScalar(*))in[2];
-  const CeedScalar(*eigenvectors_stored)    = in[3];
-  CeedScalar(*kmsgs_stress)[CEED_Q_VLA]     = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  const SgsDDContext sgsdd_ctx = (SgsDDContext)ctx;
-  CeedScalar         new_bounds[6][2];
-  CopyN(&sgsdd_ctx->data[sgsdd_ctx->offsets.out_scaling], (CeedScalar *)new_bounds, 12);
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    CeedScalar       model_outputs_i[6];
-    const CeedScalar delta = A_ij_delta[6][i];
-
-    StoredValuesUnpack(Q, i, 0, 6, model_outputs, model_outputs_i);
-    CeedScalar grad_velo_magnitude, eigenvectors[3][3], kmsgs_stress_i[6];
-    StoredValuesUnpack(Q, i, 0, 9, eigenvectors_stored, (CeedScalar *)eigenvectors);
-    StoredValuesUnpack(Q, i, 9, 1, eigenvectors_stored, &grad_velo_magnitude);
-    ComputeSgsDDOutputs(model_outputs_i, delta, eigenvectors, new_bounds, grad_velo_magnitude, kmsgs_stress_i);
-
-    for (int j = 0; j < 6; j++) kmsgs_stress[j][i] = inv_multiplicity[i] * kmsgs_stress_i[j];
-  }
-  return CEED_ERROR_SUCCESS;
-}
-
-// @brief Adds subgrid stress to residual (during IFunction evaluation)
-CEED_QFUNCTION_HELPER int FluxSubgridStress(const StatePrimitive Y, const CeedScalar km_sgs[6], CeedScalar Flux[5][3]) {
-  CeedScalar sgs[3][3];
-
-  KMUnpack(km_sgs, sgs);
-  for (CeedInt j = 0; j < 3; j++) {
-    Flux[0][j] = 0.;
-    for (CeedInt k = 0; k < 3; k++) Flux[k + 1][j] = sgs[k][j];
-    Flux[4][j] = Y.velocity[0] * sgs[0][j] + Y.velocity[1] * sgs[1][j] + Y.velocity[2] * sgs[2][j];
-  }
-  return 0;
-}
-
-CEED_QFUNCTION_HELPER int IFunction_NodalSgs(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateVariable state_var) {
-  const CeedScalar(*q)[CEED_Q_VLA]      = (const CeedScalar(*)[CEED_Q_VLA])in[0];
-  const CeedScalar(*q_data)             = in[1];
-  const CeedScalar(*km_sgs)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-  CeedScalar(*Grad_v)[5][CEED_Q_VLA]    = (CeedScalar(*)[5][CEED_Q_VLA])out[0];
-
-  NewtonianIdealGasContext gas = (NewtonianIdealGasContext)ctx;
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    const CeedScalar qi[5] = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
-    const State      s     = StateFromQ(gas, qi, state_var);
-
-    CeedScalar wdetJ, dXdx[3][3];
-    QdataUnpack_3D(Q, i, q_data, &wdetJ, dXdx);
-
-    CeedScalar       Flux[5][3];
-    const CeedScalar km_sgs_i[6] = {km_sgs[0][i], km_sgs[1][i], km_sgs[2][i], km_sgs[3][i], km_sgs[4][i], km_sgs[5][i]};
-    FluxSubgridStress(s.Y, km_sgs_i, Flux);
-
-    for (CeedInt k = 0; k < 3; k++) {
-      for (CeedInt j = 0; j < 5; j++) {
-        Grad_v[k][j][i] = -wdetJ * (dXdx[k][0] * Flux[j][0] + dXdx[k][1] * Flux[j][1] + dXdx[k][2] * Flux[j][2]);
-      }
-    }
-  }
-  return 0;
-}
-
-CEED_QFUNCTION(IFunction_NodalSgs_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return IFunction_NodalSgs(ctx, Q, in, out, STATEVAR_CONSERVATIVE);
-}
-
-CEED_QFUNCTION(IFunction_NodalSgs_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return IFunction_NodalSgs(ctx, Q, in, out, STATEVAR_PRIMITIVE);
-}
diff --git a/examples/fluids/qfunctions/sgs_dd_training.h b/examples/fluids/qfunctions/sgs_dd_training.h
deleted file mode 100644
index 803f959a1d..0000000000
--- a/examples/fluids/qfunctions/sgs_dd_training.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Structs and helper functions for training data-driven subgrid-stress models
-/// See 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation' 2022 and 'S-frame discrepancy
-/// correction models for data-informed Reynolds stress closure' 2022
-#include <ceed.h>
-
-#include "differential_filter_enums.h"
-#include "newtonian_state.h"
-#include "newtonian_types.h"
-#include "sgs_dd_utils.h"
-#include "utils.h"
-#include "utils_eigensolver_jacobi.h"
-
-typedef struct SGS_DD_TrainingContext_ *SGS_DDTrainingContext;
-struct SGS_DD_TrainingContext_ {
-  struct NewtonianIdealGasContext_ gas;
-};
-
-// @brief Calculate Data-Driven SGS model training data at nodes
-CEED_QFUNCTION_HELPER int ComputeSGS_DDAnisotropicTrainingDataNodal(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out,
-                                                                    StateVariable state_var) {
-  const CeedScalar(*q)[CEED_Q_VLA]            = (const CeedScalar(*)[CEED_Q_VLA])in[0];
-  const CeedScalar(*velo_prod)[CEED_Q_VLA]    = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  const CeedScalar(*grad_velo)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-  const CeedScalar(*A_ij_delta)[CEED_Q_VLA]   = (const CeedScalar(*)[CEED_Q_VLA])in[3];
-  const CeedScalar(*inv_multiplicity)         = (const CeedScalar(*))in[4];
-  CeedScalar(*v)[CEED_Q_VLA]                  = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  const SGS_DDTrainingContext    sgsdd_ctx = (SGS_DDTrainingContext)ctx;
-  const NewtonianIdealGasContext gas       = &sgsdd_ctx->gas;
-
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    const CeedScalar qi[5]                 = {q[0][i], q[1][i], q[2][i], q[3][i], q[4][i]};
-    const CeedScalar grad_velo_aniso[3][3] = {
-        {grad_velo[0][0][i], grad_velo[0][1][i], grad_velo[0][2][i]},
-        {grad_velo[1][0][i], grad_velo[1][1][i], grad_velo[1][2][i]},
-        {grad_velo[2][0][i], grad_velo[2][1][i], grad_velo[2][2][i]}
-    };
-    const CeedScalar km_A_ij[6] = {A_ij_delta[0][i], A_ij_delta[1][i], A_ij_delta[2][i], A_ij_delta[3][i], A_ij_delta[4][i], A_ij_delta[5][i]};
-    const CeedScalar delta      = A_ij_delta[6][i];
-    const State      s          = StateFromQ(gas, qi, state_var);
-    CeedScalar       inputs[6];
-    CeedScalar       eigenvectors[3][3], grad_velo_magnitude;  // dummy variables, don't actually use them
-
-    ComputeSgsDDInputs(grad_velo_aniso, km_A_ij, delta, gas->mu / s.U.density, eigenvectors, inputs, &grad_velo_magnitude);
-
-    for (int j = 0; j < 6; j++) v[j][i] = inv_multiplicity[i] * inputs[j];
-
-    v[0 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_XX][i] - Square(s.Y.velocity[0])) * inv_multiplicity[i];
-    v[1 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_YY][i] - Square(s.Y.velocity[1])) * inv_multiplicity[i];
-    v[2 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_ZZ][i] - Square(s.Y.velocity[2])) * inv_multiplicity[i];
-    v[3 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_YZ][i] - s.Y.velocity[1] * s.Y.velocity[2]) * inv_multiplicity[i];
-    v[4 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_XZ][i] - s.Y.velocity[0] * s.Y.velocity[2]) * inv_multiplicity[i];
-    v[5 + 6][i] = (velo_prod[DIFF_FILTER_VELOCITY_SQUARED_XY][i] - s.Y.velocity[0] * s.Y.velocity[1]) * inv_multiplicity[i];
-  }
-  return 0;
-}
-
-CEED_QFUNCTION(ComputeSGS_DDAnisotropicTrainingDataNodal_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  return ComputeSGS_DDAnisotropicTrainingDataNodal(ctx, Q, in, out, STATEVAR_PRIMITIVE);
-}
diff --git a/examples/fluids/qfunctions/sgs_dd_utils.h b/examples/fluids/qfunctions/sgs_dd_utils.h
deleted file mode 100644
index 4bcb9fc181..0000000000
--- a/examples/fluids/qfunctions/sgs_dd_utils.h
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Structs and helper functions for data-driven subgrid-stress modeling
-/// See 'Invariant data-driven subgrid stress modeling in the strain-rate eigenframe for large eddy simulation' 2022 and 'S-frame discrepancy
-/// correction models for data-informed Reynolds stress closure' 2022
-#pragma once
-
-#include <ceed.h>
-
-#include "newtonian_state.h"
-#include "newtonian_types.h"
-#include "utils.h"
-#include "utils_eigensolver_jacobi.h"
-
-// @brief Calculate Frobenius norm of velocity gradient from eigenframe quantities
-CEED_QFUNCTION_HELPER CeedScalar VelocityGradientMagnitude(const CeedScalar strain_sframe[3], const CeedScalar vorticity_sframe[3]) {
-  return sqrt(Dot3(strain_sframe, strain_sframe) + 0.5 * Dot3(vorticity_sframe, vorticity_sframe));
-};
-
-// @brief Change the order of basis vectors so that they align with vector and obey right-hand rule
-// @details The e_1 and e_3 basis vectors are the closest aligned to the vector. The e_2 is set via  e_3 x e_1
-// The basis vectors are assumed to form the rows of the basis matrix.
-CEED_QFUNCTION_HELPER void OrientBasisWithVector(CeedScalar basis[3][3], const CeedScalar vector[3]) {
-  CeedScalar alignment[3] = {0.}, cross[3];
-
-  MatVec3(basis, vector, CEED_NOTRANSPOSE, alignment);
-
-  if (alignment[0] < 0) ScaleN(basis[0], -1, 3);
-  if (alignment[2] < 0) ScaleN(basis[2], -1, 3);
-
-  Cross3(basis[2], basis[0], cross);
-  CeedScalar basis_1_orientation = Dot3(cross, basis[1]);
-  if (basis_1_orientation < 0) ScaleN(basis[1], -1, 3);
-}
-
-// @brief Denormalize outputs using min-max (de-)normalization
-CEED_QFUNCTION_HELPER void DenormalizeDDOutputs(CeedScalar output[6], const CeedScalar new_bounds[6][2], const CeedScalar old_bounds[6][2]) {
-  CeedScalar bounds_ratio;
-  for (int i = 0; i < 6; i++) {
-    bounds_ratio = (new_bounds[i][1] - new_bounds[i][0]) / (old_bounds[i][1] - old_bounds[i][0]);
-    output[i]    = bounds_ratio * (output[i] - old_bounds[i][1]) + new_bounds[i][1];
-  }
-}
-
-/**
- * @brief Compute model inputs for anisotropic data-driven model
- *
- * @param[in]  grad_velo_aniso     Gradient of velocity in physical (anisotropic) coordinates
- * @param[in]  km_A_ij             Anisotropy tensor, in Kelvin-Mandel notation
- * @param[in]  delta               Length used to create anisotropy tensor
- * @param[in]  viscosity           Kinematic viscosity
- * @param[out] eigenvectors        Eigenvectors of the (anisotropic) velocity gradient
- * @param[out] inputs              Data-driven model inputs
- * @param[out] grad_velo_magnitude Frobenius norm of the velocity gradient
- */
-CEED_QFUNCTION_HELPER void ComputeSgsDDInputs(const CeedScalar grad_velo_aniso[3][3], const CeedScalar km_A_ij[6], const CeedScalar delta,
-                                              const CeedScalar viscosity, CeedScalar eigenvectors[3][3], CeedScalar inputs[6],
-                                              CeedScalar *grad_velo_magnitude) {
-  CeedScalar strain_sframe[3] = {0.}, vorticity_sframe[3] = {0.};
-  CeedScalar A_ij[3][3] = {{0.}}, grad_velo_iso[3][3] = {{0.}};
-
-  // -- Transform physical, anisotropic velocity gradient to isotropic
-  KMUnpack(km_A_ij, A_ij);
-  MatMat3(grad_velo_aniso, A_ij, CEED_NOTRANSPOSE, CEED_NOTRANSPOSE, grad_velo_iso);
-
-  {  // -- Get Eigenframe
-    CeedScalar kmstrain_iso[6], strain_iso[3][3];
-    CeedInt    work_vector[3] = {0};
-    KMStrainRate(grad_velo_iso, kmstrain_iso);
-    KMUnpack(kmstrain_iso, strain_iso);
-    Diagonalize3(strain_iso, strain_sframe, eigenvectors, work_vector, SORT_DECREASING_EVALS, true, 5);
-  }
-
-  {  // -- Get vorticity in S-frame
-    CeedScalar rotation_iso[3][3];
-    RotationRate(grad_velo_iso, rotation_iso);
-    CeedScalar vorticity_iso[3] = {-2 * rotation_iso[1][2], 2 * rotation_iso[0][2], -2 * rotation_iso[0][1]};
-    OrientBasisWithVector(eigenvectors, vorticity_iso);
-    MatVec3(eigenvectors, vorticity_iso, CEED_NOTRANSPOSE, vorticity_sframe);
-  }
-
-  // -- Calculate DD model inputs
-  *grad_velo_magnitude = VelocityGradientMagnitude(strain_sframe, vorticity_sframe);
-  inputs[0]            = strain_sframe[0];
-  inputs[1]            = strain_sframe[1];
-  inputs[2]            = strain_sframe[2];
-  inputs[3]            = vorticity_sframe[0];
-  inputs[4]            = vorticity_sframe[1];
-  inputs[5]            = viscosity / Square(delta);
-  ScaleN(inputs, 1 / (*grad_velo_magnitude + CEED_EPSILON), 6);
-}
-
-/**
- * @brief Compute the physical SGS stresses from the neural-network output
- *
- * @param[in,out] outputs             Outputs from the neural-network
- * @param[in]     delta               Length used to create anisotropy tensor
- * @param[in]     eigenvectors        Eigenvectors of the (anisotropic) velocity gradient
- * @param[in]     new_bounds          Bounds used for min-max de-normalization
- * @param[in]     grad_velo_magnitude Magnitude of the velocity gradient
- * @param[out]    kmsgs_stress        Physical SGS stresses in Kelvin-Mandel notation
- */
-CEED_QFUNCTION_HELPER void ComputeSgsDDOutputs(CeedScalar outputs[6], const CeedScalar delta, const CeedScalar eigenvectors[3][3],
-                                               const CeedScalar new_bounds[6][2], const CeedScalar grad_velo_magnitude, CeedScalar kmsgs_stress[6]) {
-  CeedScalar old_bounds[6][2] = {{0}};
-  for (int j = 0; j < 6; j++) old_bounds[j][1] = 1;
-  DenormalizeDDOutputs(outputs, new_bounds, old_bounds);
-
-  // Re-dimensionalize sgs_stress
-  ScaleN(outputs, Square(delta) * Square(grad_velo_magnitude), 6);
-
-  CeedScalar sgs_stress[3][3] = {{0.}};
-  {  // Rotate SGS Stress back to physical frame, SGS_physical = E^T SGS_sframe E
-    CeedScalar       Evec_sgs[3][3]   = {{0.}};
-    const CeedScalar sgs_sframe[3][3] = {
-        {outputs[0], outputs[3], outputs[4]},
-        {outputs[3], outputs[1], outputs[5]},
-        {outputs[4], outputs[5], outputs[2]},
-    };
-    MatMat3(eigenvectors, sgs_sframe, CEED_TRANSPOSE, CEED_NOTRANSPOSE, Evec_sgs);
-    MatMat3(Evec_sgs, eigenvectors, CEED_NOTRANSPOSE, CEED_NOTRANSPOSE, sgs_stress);
-  }
-
-  KMPack(sgs_stress, kmsgs_stress);
-}
diff --git a/examples/fluids/qfunctions/shocktube.h b/examples/fluids/qfunctions/shocktube.h
index 87cdf73d4d..64e0798a44 100644
--- a/examples/fluids/qfunctions/shocktube.h
+++ b/examples/fluids/qfunctions/shocktube.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -10,8 +10,11 @@
 
 // Model from:
 //   On the Order of Accuracy and Numerical Performance of Two Classes of Finite Volume WENO Schemes, Zhang, Zhang, and Shu (2011).
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#include <stdbool.h>
+#endif
 
 #include "utils.h"
 
diff --git a/examples/fluids/qfunctions/stabilization.h b/examples/fluids/qfunctions/stabilization.h
index 55d99820c3..87f05823aa 100644
--- a/examples/fluids/qfunctions/stabilization.h
+++ b/examples/fluids/qfunctions/stabilization.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,7 @@
 
 /// @file
 /// Helper functions for computing stabilization terms of a newtonian simulation
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "newtonian_state.h"
 
diff --git a/examples/fluids/qfunctions/stabilization_types.h b/examples/fluids/qfunctions/stabilization_types.h
index 97492dd73a..8544e428e9 100644
--- a/examples/fluids/qfunctions/stabilization_types.h
+++ b/examples/fluids/qfunctions/stabilization_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/stg_shur14.h b/examples/fluids/qfunctions/stg_shur14.h
index d6c7464660..2e8b05db1c 100644
--- a/examples/fluids/qfunctions/stg_shur14.h
+++ b/examples/fluids/qfunctions/stg_shur14.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -12,9 +12,11 @@
 /// SetupSTG_Rand reads in the input files and fills in STGShur14Context.
 /// Then STGShur14_CalcQF is run over quadrature points.
 /// Before the program exits, TearDownSTG is run to free the memory of the allocated arrays.
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
 #include <stdlib.h>
+#endif
 
 #include "newtonian_state.h"
 #include "setupgeo_helpers.h"
@@ -102,12 +104,12 @@ CEED_QFUNCTION_HELPER CeedScalar Calc_qn(const CeedScalar kappa, const CeedScala
 }
 
 // Calculate hmax, ke, keta, and kcut
-CEED_QFUNCTION_HELPER void SpectrumConstants(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar h[3],
+CEED_QFUNCTION_HELPER void SpectrumConstants(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar hNodSep[3],
                                              const CeedScalar nu, CeedScalar *hmax, CeedScalar *ke, CeedScalar *keta, CeedScalar *kcut) {
-  *hmax = Max(Max(h[0], h[1]), h[2]);
+  *hmax = Max(Max(hNodSep[0], hNodSep[1]), hNodSep[2]);
   *ke   = wall_dist == 0 ? 1e16 : 2 * M_PI / Min(2 * wall_dist, 3 * lt);
   *keta = 2 * M_PI * pow(Cube(nu) / eps, -0.25);
-  *kcut = M_PI / Min(Max(Max(h[1], h[2]), 0.3 * (*hmax)) + 0.1 * wall_dist, *hmax);
+  *kcut = M_PI / Min(Max(Max(hNodSep[1], hNodSep[2]), 0.3 * (*hmax)) + 0.1 * wall_dist, *hmax);
 }
 
 /*
@@ -115,21 +117,21 @@ CEED_QFUNCTION_HELPER void SpectrumConstants(const CeedScalar wall_dist, const C
  *
  * Calculates q_n at a given distance to the wall
  *
- * @param[in]  wall_dist Distance to the nearest wall
- * @param[in]  eps       Turbulent dissipation w/rt wall_dist
- * @param[in]  lt        Turbulent length scale w/rt wall_dist
- * @param[in]  h         Element lengths in coordinate directions
- * @param[in]  nu        Dynamic Viscosity;
- * @param[in]  stg_ctx   STGShur14Context for the problem
- * @param[out] qn        Spectrum coefficients, [nmodes]
+ * @param[in]  wall_dist  Distance to the nearest wall
+ * @param[in]  eps        Turbulent dissipation w/rt wall_dist
+ * @param[in]  lt         Turbulent length scale w/rt wall_dist
+ * @param[in]  h_node_sep Element lengths in coordinate directions
+ * @param[in]  nu         Dynamic Viscosity;
+ * @param[in]  stg_ctx    STGShur14Context for the problem
+ * @param[out] qn         Spectrum coefficients, [nmodes]
  */
-CEED_QFUNCTION_HELPER void CalcSpectrum(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar h[3],
+CEED_QFUNCTION_HELPER void CalcSpectrum(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar h_node_sep[3],
                                         const CeedScalar nu, CeedScalar qn[], const StgShur14Context stg_ctx) {
   const CeedInt     nmodes = stg_ctx->nmodes;
   const CeedScalar *kappa  = &stg_ctx->data[stg_ctx->offsets.kappa];
   CeedScalar        hmax, ke, keta, kcut, Ektot = 0.0;
 
-  SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut);
+  SpectrumConstants(wall_dist, eps, lt, h_node_sep, nu, &hmax, &ke, &keta, &kcut);
 
   for (CeedInt n = 0; n < nmodes; n++) {
     const CeedScalar dkappa = n == 0 ? kappa[0] : kappa[n] - kappa[n - 1];
@@ -181,28 +183,29 @@ CEED_QFUNCTION_HELPER void StgShur14Calc(const CeedScalar X[3], const CeedScalar
 /******************************************************
  * @brief Calculate u(x,t) for STG inflow condition
  *
- * @param[in]  X         Location to evaluate u(X,t)
- * @param[in]  t         Time to evaluate u(X,t)
- * @param[in]  ubar      Mean velocity at X
- * @param[in]  cij       Cholesky decomposition at X
- * @param[in]  Ektot     Total spectrum energy at this location
- * @param[in]  h         Element size in 3 directions
- * @param[in]  wall_dist Distance to closest wall
- * @param[in]  eps       Turbulent dissipation
- * @param[in]  lt        Turbulent length scale
- * @param[out] u         Velocity at X and t
- * @param[in]  stg_ctx   STGShur14Context for the problem
+ * @param[in]  X          Location to evaluate u(X,t)
+ * @param[in]  t          Time to evaluate u(X,t)
+ * @param[in]  ubar       Mean velocity at X
+ * @param[in]  cij        Cholesky decomposition at X
+ * @param[in]  Ektot      Total spectrum energy at this location
+ * @param[in]  h_node_sep Element size in 3 directions
+ * @param[in]  wall_dist  Distance to closest wall
+ * @param[in]  eps        Turbulent dissipation
+ * @param[in]  lt         Turbulent length scale
+ * @param[out] u          Velocity at X and t
+ * @param[in]  stg_ctx    STGShur14Context for the problem
  */
 CEED_QFUNCTION_HELPER void StgShur14Calc_PrecompEktot(const CeedScalar X[3], const CeedScalar t, const CeedScalar ubar[3], const CeedScalar cij[6],
-                                                      const CeedScalar Ektot, const CeedScalar h[3], const CeedScalar wall_dist, const CeedScalar eps,
-                                                      const CeedScalar lt, const CeedScalar nu, CeedScalar u[3], const StgShur14Context stg_ctx) {
+                                                      const CeedScalar Ektot, const CeedScalar h_node_sep[3], const CeedScalar wall_dist,
+                                                      const CeedScalar eps, const CeedScalar lt, const CeedScalar nu, CeedScalar u[3],
+                                                      const StgShur14Context stg_ctx) {
   const CeedInt     nmodes = stg_ctx->nmodes;
   const CeedScalar *kappa  = &stg_ctx->data[stg_ctx->offsets.kappa];
   const CeedScalar *phi    = &stg_ctx->data[stg_ctx->offsets.phi];
   const CeedScalar *sigma  = &stg_ctx->data[stg_ctx->offsets.sigma];
   const CeedScalar *d      = &stg_ctx->data[stg_ctx->offsets.d];
   CeedScalar        hmax, ke, keta, kcut;
-  SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut);
+  SpectrumConstants(wall_dist, eps, lt, h_node_sep, nu, &hmax, &ke, &keta, &kcut);
   CeedScalar xdotd, vp[3] = {0.};
   CeedScalar xhat[] = {0., X[1], X[2]};
 
@@ -254,12 +257,13 @@ CEED_QFUNCTION(StgShur14Preprocess)(void *ctx, CeedInt Q, const CeedScalar *cons
         {dXdx_q[1][0][i], dXdx_q[1][1][i], dXdx_q[1][2][i]},
     };
 
-    CeedScalar h[3];
-    h[0] = dx;
-    for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(dXdx[0][j] * dXdx[0][j] + dXdx[1][j] * dXdx[1][j]);
+    CeedScalar h_node_sep[3];
+    h_node_sep[0] = dx;
+    for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(dXdx[0][j] * dXdx[0][j] + dXdx[1][j] * dXdx[1][j]);
+    ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3);
 
     InterpolateProfile(wall_dist, ubar, cij, &eps, &lt, stg_ctx);
-    SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut);
+    SpectrumConstants(wall_dist, eps, lt, h_node_sep, nu, &hmax, &ke, &keta, &kcut);
 
     // Calculate total TKE per spectrum
     CeedScalar Ek_tot = 0;
@@ -279,48 +283,38 @@ CEED_QFUNCTION(ICsStg)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSc
   const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1];
   CeedScalar(*q0)[CEED_Q_VLA]         = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
-  const StgShur14Context stg_ctx = (StgShur14Context)ctx;
-  CeedScalar             qn[STG_NMODES_MAX], u[3], ubar[3], cij[6], eps, lt;
-  const CeedScalar       dx     = stg_ctx->dx;
-  const CeedScalar       time   = stg_ctx->time;
-  const CeedScalar       theta0 = stg_ctx->theta0;
-  const CeedScalar       P0     = stg_ctx->P0;
-  const CeedScalar       cv     = stg_ctx->newtonian_ctx.cv;
-  const CeedScalar       rho    = P0 / (GasConstant(&stg_ctx->newtonian_ctx) * theta0);
-  const CeedScalar       nu     = stg_ctx->newtonian_ctx.mu / rho;
+  const StgShur14Context         stg_ctx = (StgShur14Context)ctx;
+  const NewtonianIdealGasContext gas     = &stg_ctx->newtonian_ctx;
+  CeedScalar                     qn[STG_NMODES_MAX], u[3], ubar[3], cij[6], eps, lt;
+  const CeedScalar               dx     = stg_ctx->dx;
+  const CeedScalar               time   = stg_ctx->time;
+  const CeedScalar               theta0 = stg_ctx->theta0;
+  const CeedScalar               P0     = stg_ctx->P0;
+  const CeedScalar               rho    = P0 / (GasConstant(gas) * theta0);
+  const CeedScalar               nu     = gas->mu / rho;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar x_i[3] = {x[0][i], x[1][i], x[2][i]};
     CeedScalar       dXdx[3][3];
     InvertMappingJacobian_3D(Q, i, J, dXdx, NULL);
-    CeedScalar h[3];
-    h[0] = dx;
-    for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]) + Square(dXdx[2][j]));
+    CeedScalar h_node_sep[3];
+    h_node_sep[0] = dx;
+    for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]) + Square(dXdx[2][j]));
+    ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3);
 
     InterpolateProfile(x_i[1], ubar, cij, &eps, &lt, stg_ctx);
     if (stg_ctx->use_fluctuating_IC) {
-      CalcSpectrum(x_i[1], eps, lt, h, nu, qn, stg_ctx);
+      CalcSpectrum(x_i[1], eps, lt, h_node_sep, nu, qn, stg_ctx);
       StgShur14Calc(x_i, time, ubar, cij, qn, u, stg_ctx);
     } else {
       for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j];
     }
 
-    switch (stg_ctx->newtonian_ctx.state_var) {
-      case STATEVAR_CONSERVATIVE:
-        q0[0][i] = rho;
-        q0[1][i] = u[0] * rho;
-        q0[2][i] = u[1] * rho;
-        q0[3][i] = u[2] * rho;
-        q0[4][i] = rho * (0.5 * Dot3(u, u) + cv * theta0);
-        break;
-
-      case STATEVAR_PRIMITIVE:
-        q0[0][i] = P0;
-        q0[1][i] = u[0];
-        q0[2][i] = u[1];
-        q0[3][i] = u[2];
-        q0[4][i] = theta0;
-        break;
+    CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5] = {0.};
+    State      s = StateFromY(gas, Y);
+    StateToQ(gas, s, q, gas->state_var);
+    for (CeedInt j = 0; j < 5; j++) {
+      q0[j][i] = q[j];
     }
   }
   return 0;
@@ -361,13 +355,14 @@ CEED_QFUNCTION(StgShur14Inflow)(void *ctx, CeedInt Q, const CeedScalar *const *i
     QdataBoundaryUnpack_3D(Q, i, q_data_sur, &wdetJb, dXdx, norm);
     wdetJb *= is_implicit ? -1. : 1.;
 
-    CeedScalar h[3];
-    h[0] = dx;
-    for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]));
+    CeedScalar h_node_sep[3];
+    h_node_sep[0] = dx;
+    for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]));
+    ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3);
 
     InterpolateProfile(X[1][i], ubar, cij, &eps, &lt, stg_ctx);
     if (!mean_only) {
-      CalcSpectrum(X[1][i], eps, lt, h, mu / rho, qn, stg_ctx);
+      CalcSpectrum(X[1][i], eps, lt, h_node_sep, mu / rho, qn, stg_ctx);
       StgShur14Calc(x, time, ubar, cij, qn, u, stg_ctx);
     } else {
       for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j];
@@ -477,15 +472,16 @@ CEED_QFUNCTION(StgShur14InflowStrongQF)(void *ctx, CeedInt Q, const CeedScalar *
   const CeedScalar(*inv_Ektotal)           = (const CeedScalar(*))in[3];
   CeedScalar(*bcval)[CEED_Q_VLA]           = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
-  const StgShur14Context stg_ctx = (StgShur14Context)ctx;
-  CeedScalar             u[3], ubar[3], cij[6], eps, lt;
-  const bool             mean_only = stg_ctx->mean_only;
-  const CeedScalar       dx        = stg_ctx->dx;
-  const CeedScalar       time      = stg_ctx->time;
-  const CeedScalar       theta0    = stg_ctx->theta0;
-  const CeedScalar       P0        = stg_ctx->P0;
-  const CeedScalar       rho       = P0 / (GasConstant(&stg_ctx->newtonian_ctx) * theta0);
-  const CeedScalar       nu        = stg_ctx->newtonian_ctx.mu / rho;
+  const StgShur14Context         stg_ctx = (StgShur14Context)ctx;
+  const NewtonianIdealGasContext gas     = &stg_ctx->newtonian_ctx;
+  CeedScalar                     u[3], ubar[3], cij[6], eps, lt;
+  const bool                     mean_only = stg_ctx->mean_only;
+  const CeedScalar               dx        = stg_ctx->dx;
+  const CeedScalar               time      = stg_ctx->time;
+  const CeedScalar               theta0    = stg_ctx->theta0;
+  const CeedScalar               P0        = stg_ctx->P0;
+  const CeedScalar               rho       = P0 / (GasConstant(gas) * theta0);
+  const CeedScalar               nu        = gas->mu / rho;
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     const CeedScalar x[]        = {coords[0][i], coords[1][i], coords[2][i]};
@@ -494,40 +490,41 @@ CEED_QFUNCTION(StgShur14InflowStrongQF)(void *ctx, CeedInt Q, const CeedScalar *
         {dXdx_q[1][0][i], dXdx_q[1][1][i], dXdx_q[1][2][i]},
     };
 
-    CeedScalar h[3];
-    h[0] = dx;
-    for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]));
+    CeedScalar h_node_sep[3];
+    h_node_sep[0] = dx;
+    for (CeedInt j = 1; j < 3; j++) h_node_sep[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]));
+    ScaleN(h_node_sep, stg_ctx->h_scale_factor, 3);
 
     InterpolateProfile(coords[1][i], ubar, cij, &eps, &lt, stg_ctx);
     if (!mean_only) {
       if (1) {
-        StgShur14Calc_PrecompEktot(x, time, ubar, cij, inv_Ektotal[i], h, x[1], eps, lt, nu, u, stg_ctx);
+        StgShur14Calc_PrecompEktot(x, time, ubar, cij, inv_Ektotal[i], h_node_sep, x[1], eps, lt, nu, u, stg_ctx);
       } else {  // Original way
         CeedScalar qn[STG_NMODES_MAX];
-        CalcSpectrum(coords[1][i], eps, lt, h, nu, qn, stg_ctx);
+        CalcSpectrum(coords[1][i], eps, lt, h_node_sep, nu, qn, stg_ctx);
         StgShur14Calc(x, time, ubar, cij, qn, u, stg_ctx);
       }
     } else {
       for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j];
     }
 
-    switch (stg_ctx->newtonian_ctx.state_var) {
+    CeedScalar Y[5] = {P0, u[0], u[1], u[2], theta0}, q[5] = {0.};
+    State      s = StateFromY(gas, Y);
+    StateToQ(gas, s, q, gas->state_var);
+    switch (gas->state_var) {
       case STATEVAR_CONSERVATIVE:
-        bcval[0][i] = scale[i] * rho;
-        bcval[1][i] = scale[i] * rho * u[0];
-        bcval[2][i] = scale[i] * rho * u[1];
-        bcval[3][i] = scale[i] * rho * u[2];
-        bcval[4][i] = 0.;
+        q[4] = 0.;  // Don't set energy
         break;
-
       case STATEVAR_PRIMITIVE:
-        bcval[0][i] = 0;
-        bcval[1][i] = scale[i] * u[0];
-        bcval[2][i] = scale[i] * u[1];
-        bcval[3][i] = scale[i] * u[2];
-        bcval[4][i] = scale[i] * theta0;
+        q[0] = 0;  // Don't set pressure
+        break;
+      case STATEVAR_ENTROPY:
+        q[0] = 0;  // Don't set V_density
         break;
     }
+    for (CeedInt j = 0; j < 5; j++) {
+      bcval[j][i] = scale[i] * q[j];
+    }
   }
   return 0;
 }
diff --git a/examples/fluids/qfunctions/stg_shur14_type.h b/examples/fluids/qfunctions/stg_shur14_type.h
index a8ed21c292..945956de84 100644
--- a/examples/fluids/qfunctions/stg_shur14_type.h
+++ b/examples/fluids/qfunctions/stg_shur14_type.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -6,7 +6,10 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <stdbool.h>
+#endif
 
 #include "newtonian_types.h"
 
@@ -25,6 +28,7 @@ struct STGShur14Context_ {
   bool                             is_implicit;         // !< Whether using implicit time integration
   bool                             mean_only;           // !< Only apply the mean profile
   CeedScalar                       dx;                  // !< dx used for h calculation
+  CeedScalar                       h_scale_factor;      // !< Scales the element size
   bool                             prescribe_T;         // !< Prescribe temperature weakly
   bool                             use_fluctuating_IC;  // !< Only apply the mean profile
   struct NewtonianIdealGasContext_ newtonian_ctx;
diff --git a/examples/fluids/qfunctions/strong_boundary_conditions.h b/examples/fluids/qfunctions/strong_boundary_conditions.h
index a503a236d9..1526580963 100644
--- a/examples/fluids/qfunctions/strong_boundary_conditions.h
+++ b/examples/fluids/qfunctions/strong_boundary_conditions.h
@@ -1,10 +1,10 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "setupgeo_helpers.h"
 
diff --git a/examples/fluids/qfunctions/taylorgreen.h b/examples/fluids/qfunctions/taylorgreen.h
index 72c128400d..c28e718913 100644
--- a/examples/fluids/qfunctions/taylorgreen.h
+++ b/examples/fluids/qfunctions/taylorgreen.h
@@ -1,11 +1,13 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
@@ -17,12 +19,12 @@ CEED_QFUNCTION(ICsTaylorGreen)(void *ctx, CeedInt Q, const CeedScalar *const *in
 
   CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
 
-  const SetupContext                context   = (SetupContext)ctx;
-  struct NewtonianIdealGasContext_ *gas       = &context->gas;
-  CeedScalar                        R         = GasConstant(gas);
-  StatePrimitive                    reference = context->reference;
-  const CeedScalar                  V0        = sqrt(Dot3(reference.velocity, reference.velocity));
-  const CeedScalar                  density0  = reference.pressure / (reference.temperature * R);
+  const SetupContext             context   = (SetupContext)ctx;
+  const NewtonianIdealGasContext gas       = &context->gas;
+  CeedScalar                     R         = GasConstant(gas);
+  StatePrimitive                 reference = context->reference;
+  const CeedScalar               V0        = sqrt(Dot3(reference.velocity, reference.velocity));
+  const CeedScalar               density0  = reference.pressure / (reference.temperature * R);
 
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     CeedScalar x[]  = {X[0][i], X[1][i], X[2][i]};
@@ -36,15 +38,7 @@ CEED_QFUNCTION(ICsTaylorGreen)(void *ctx, CeedInt Q, const CeedScalar *const *in
     Y[4] = reference.temperature;
 
     State s = StateFromY(gas, Y);
-    switch (gas->state_var) {
-      case STATEVAR_CONSERVATIVE:
-        UnpackState_U(s.U, q);
-        break;
-      case STATEVAR_PRIMITIVE:
-        UnpackState_Y(s.Y, q);
-        break;
-    }
-
+    StateToQ(gas, s, q, gas->state_var);
     for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j];
   }
   return 0;
diff --git a/examples/fluids/qfunctions/turb_spanstats.h b/examples/fluids/qfunctions/turb_spanstats.h
index dccba29a7b..6331b119e9 100644
--- a/examples/fluids/qfunctions/turb_spanstats.h
+++ b/examples/fluids/qfunctions/turb_spanstats.h
@@ -1,10 +1,10 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "newtonian_state.h"
 #include "turb_stats_types.h"
@@ -59,6 +59,10 @@ CEED_QFUNCTION(ChildStatsCollection_Prim)(void *ctx, CeedInt Q, const CeedScalar
   return ChildStatsCollection(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
 
+CEED_QFUNCTION(ChildStatsCollection_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return ChildStatsCollection(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
+
 // QFunctions for testing
 CEED_QFUNCTION_HELPER CeedScalar ChildStatsCollectionTest_Exact(const CeedScalar x_i[3]) { return x_i[0] + Square(x_i[1]); }
 
diff --git a/examples/fluids/qfunctions/turb_stats_types.h b/examples/fluids/qfunctions/turb_stats_types.h
index 95136f9ff0..dccae3653a 100644
--- a/examples/fluids/qfunctions/turb_stats_types.h
+++ b/examples/fluids/qfunctions/turb_stats_types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/qfunctions/utils.h b/examples/fluids/qfunctions/utils.h
index f414e14e9c..bd9d787efc 100644
--- a/examples/fluids/qfunctions/utils.h
+++ b/examples/fluids/qfunctions/utils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -6,8 +6,10 @@
 // This file is part of CEED:  http://github.com/ceed
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef M_PI
 #define M_PI 3.14159265358979323846
diff --git a/examples/fluids/qfunctions/utils_eigensolver_jacobi.h b/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
index b8236789d2..71587633dd 100644
--- a/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
+++ b/examples/fluids/qfunctions/utils_eigensolver_jacobi.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -9,8 +9,11 @@
 /// Eigen system solver for symmetric NxN matrices. Modified from the CC0 code provided at https://github.com/jewettaij/jacobi_pd
 #pragma once
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#include <stdbool.h>
+#endif
 
 #include "utils.h"
 
diff --git a/examples/fluids/qfunctions/velocity_gradient_projection.h b/examples/fluids/qfunctions/velocity_gradient_projection.h
index 73b51eff84..2fecc3f258 100644
--- a/examples/fluids/qfunctions/velocity_gradient_projection.h
+++ b/examples/fluids/qfunctions/velocity_gradient_projection.h
@@ -1,10 +1,10 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "newtonian_state.h"
 #include "newtonian_types.h"
@@ -47,3 +47,7 @@ CEED_QFUNCTION(VelocityGradientProjectionRHS_Conserv)(void *ctx, CeedInt Q, cons
 CEED_QFUNCTION(VelocityGradientProjectionRHS_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   return VelocityGradientProjectionRHS(ctx, Q, in, out, STATEVAR_PRIMITIVE);
 }
+
+CEED_QFUNCTION(VelocityGradientProjectionRHS_Entropy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  return VelocityGradientProjectionRHS(ctx, Q, in, out, STATEVAR_ENTROPY);
+}
diff --git a/examples/fluids/smartsim_regression_framework.py b/examples/fluids/smartsim_regression_framework.py
deleted file mode 100755
index 2834263e6a..0000000000
--- a/examples/fluids/smartsim_regression_framework.py
+++ /dev/null
@@ -1,241 +0,0 @@
-#!/usr/bin/env python3
-from junit_xml import TestCase
-from smartsim import Experiment
-from smartsim.settings import RunSettings
-from smartredis import Client
-import numpy as np
-from pathlib import Path
-import argparse
-import traceback
-import sys
-import time
-from typing import Tuple
-import os
-import shutil
-import logging
-import socket
-
-# autopep8 off
-sys.path.insert(0, (Path(__file__).parents[3] / "tests/junit-xml").as_posix())
-# autopep8 on
-
-logging.disable(logging.WARNING)
-
-fluids_example_dir = Path(__file__).parent.absolute()
-
-
-def getOpenSocket():
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.bind(('', 0))
-    addr = s.getsockname()
-    s.close()
-    return addr[1]
-
-
-class NoError(Exception):
-    pass
-
-
-def assert_np_all(test, truth):
-    """Assert with better error reporting"""
-    try:
-        assert np.all(test == truth)
-    except Exception as e:
-        raise Exception(f"Expected {truth}, but got {test}") from e
-
-
-def assert_equal(test, truth):
-    """Assert with better error reporting"""
-    try:
-        assert test == truth
-    except Exception as e:
-        raise Exception(f"Expected {truth}, but got {test}") from e
-
-
-def verify_training_data(database_array, correct_array, ceed_resource, atol=1e-8, rtol=1e-8):
-    """Verify the training data
-
-    Cannot just use np.allclose due to vorticity vector directionality.
-    Check whether the S-frame-oriented vorticity vector's second component is just flipped.
-    This can happen due to the eigenvector ordering changing based on whichever one is closest to the vorticity vector.
-    If two eigenvectors are very close to the vorticity vector, this can cause the ordering to flip.
-    This flipping of the vorticity vector is not incorrect, just a known sensitivity of the model.
-    """
-    if not np.allclose(database_array, correct_array, atol=atol, rtol=rtol):
-
-        total_tolerances = atol + rtol * np.abs(correct_array)  # mimic np.allclose tolerance calculation
-        idx_notclose = np.where(np.abs(database_array - correct_array) > total_tolerances)
-        if not np.all(idx_notclose[1] == 4):
-            # values other than vorticity are not close
-            test_fail = True
-        else:
-            database_vorticity = database_array[idx_notclose]
-            correct_vorticity = correct_array[idx_notclose]
-            test_fail = False if np.allclose(-database_vorticity, correct_vorticity,
-                                             atol=atol, rtol=rtol) else True
-
-        if test_fail:
-            database_output_path = Path(
-                f"./y0_database_values_{ceed_resource.replace('/', '_')}.npy").absolute()
-            np.save(database_output_path, database_array)
-            raise AssertionError(f"Array values in database max difference: {np.max(np.abs(correct_array - database_array))}\n"
-                                 f"Array saved to {database_output_path.as_posix()}")
-
-
-class SmartSimTest(object):
-
-    def __init__(self, directory_path: Path):
-        self.exp: Experiment
-        self.database = None
-        self.directory_path: Path = directory_path
-        self.original_path: Path
-
-    def setup(self):
-        """To create the test directory and start SmartRedis database"""
-        self.original_path = Path(os.getcwd())
-
-        if self.directory_path.exists() and self.directory_path.is_dir():
-            shutil.rmtree(self.directory_path)
-        self.directory_path.mkdir()
-        os.chdir(self.directory_path)
-
-        PORT = getOpenSocket()
-        self.exp = Experiment("test", launcher="local")
-        self.database = self.exp.create_database(port=PORT, batch=False, interface="lo")
-        self.exp.generate(self.database)
-        self.exp.start(self.database)
-
-        # SmartRedis will complain if these aren't set
-        os.environ['SR_LOG_FILE'] = 'R'
-        os.environ['SR_LOG_LEVEL'] = 'INFO'
-
-    def test(self, ceed_resource) -> Tuple[bool, Exception, str]:
-        client = None
-        arguments = []
-        exe_path = "../../build/fluids-navierstokes"
-        try:
-            arguments = [
-                '-ceed', ceed_resource,
-                '-options_file', (fluids_example_dir / 'blasius.yaml').as_posix(),
-                '-ts_max_steps', '2',
-                '-diff_filter_grid_based_width',
-                '-ts_monitor', '-snes_monitor',
-                '-diff_filter_ksp_max_it', '50', '-diff_filter_ksp_monitor',
-                '-degree', '1',
-                '-sgs_train_enable',
-                '-sgs_train_write_data_interval', '2',
-                '-sgs_train_filter_width_scales', '1.2,3.1',
-                '-bc_symmetry_z',
-                '-dm_plex_shape', 'zbox',
-                '-dm_plex_box_bd', 'none,none,periodic',
-                '-dm_plex_box_faces', '4,6,1',
-                '-mesh_transform',
-            ]
-
-            run_settings = RunSettings(exe_path, exe_args=arguments)
-
-            client_exp = self.exp.create_model(f"client_{ceed_resource.replace('/', '_')}", run_settings)
-
-            # Start the client model
-            self.exp.start(client_exp, summary=False, block=True)
-
-            client = Client(cluster=False, address=self.database.get_address()[0])
-
-            assert client.poll_tensor("sizeInfo", 250, 5)
-            assert_np_all(client.get_tensor("sizeInfo"), np.array([35, 12, 6, 1, 1, 0]))
-
-            assert client.poll_tensor("check-run", 250, 5)
-            assert_equal(client.get_tensor("check-run")[0], 1)
-
-            assert client.poll_tensor("tensor-ow", 250, 5)
-            assert_equal(client.get_tensor("tensor-ow")[0], 1)
-
-            assert client.poll_tensor("num_filter_widths", 250, 5)
-            assert_equal(client.get_tensor("num_filter_widths")[0], 2)
-
-            assert client.poll_tensor("step", 250, 5)
-            assert_equal(client.get_tensor("step")[0], 2)
-
-            assert client.poll_tensor("y.0.0", 250, 5)
-            test_data_path = fluids_example_dir / "tests-output/y00_output.npy"
-            assert test_data_path.is_file()
-            correct_value = np.load(test_data_path)
-            database_value = client.get_tensor("y.0.0")
-            verify_training_data(database_value, correct_value, ceed_resource)
-
-            assert client.poll_tensor("y.0.1", 250, 5)
-            test_data_path = fluids_example_dir / "tests-output/y01_output.npy"
-            assert test_data_path.is_file()
-            correct_value = np.load(test_data_path)
-            database_value = client.get_tensor("y.0.1")
-            verify_training_data(database_value, correct_value, ceed_resource)
-
-            client.flush_db([os.environ["SSDB"]])
-            output = (True, NoError(), exe_path + ' ' + ' '.join(arguments))
-        except Exception as e:
-            output = (False, e, exe_path + ' ' + ' '.join(arguments))
-
-        finally:
-            if client:
-                client.flush_db([os.environ["SSDB"]])
-
-        return output
-
-    def test_junit(self, ceed_resource):
-        start: float = time.time()
-
-        passTest, exception, args = self.test(ceed_resource)
-
-        output = "" if isinstance(exception, NoError) else ''.join(
-            traceback.TracebackException.from_exception(exception).format())
-
-        test_case = TestCase(f'SmartSim Test {ceed_resource}',
-                             elapsed_sec=time.time() - start,
-                             timestamp=time.strftime(
-                                 '%Y-%m-%d %H:%M:%S %Z', time.localtime(start)),
-                             stdout=output,
-                             stderr=output,
-                             allow_multiple_subelements=True,
-                             category=f'SmartSim Tests')
-        test_case.args = args
-        if not passTest and 'occa' in ceed_resource:
-            test_case.add_skipped_info("OCCA mode not supported")
-        elif not passTest:
-            test_case.add_failure_info("exception", output)
-
-        return test_case
-
-    def teardown(self):
-        self.exp.stop(self.database)
-        os.chdir(self.original_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser('Testing script for SmartSim integration')
-    parser.add_argument(
-        '-c',
-        '--ceed-backends',
-        type=str,
-        nargs='*',
-        default=['/cpu/self'],
-        help='libCEED backend to use with convergence tests')
-    args = parser.parse_args()
-
-    test_dir = fluids_example_dir / "test_dir"
-    print("Setting up database...", end='')
-    test_framework = SmartSimTest(test_dir)
-    test_framework.setup()
-    print(" Done!")
-    for ceed_resource in args.ceed_backends:
-        print("working on " + ceed_resource + ' ...', end='')
-        passTest, exception, _ = test_framework.test(ceed_resource)
-
-        if passTest:
-            print("Passed!")
-        else:
-            print("Failed!", file=sys.stderr)
-            print('\t' + ''.join(traceback.TracebackException.from_exception(exception).format()), file=sys.stderr)
-
-    print("Cleaning up database...", end='')
-    test_framework.teardown()
-    print(" Done!")
diff --git a/examples/fluids/src/bc_definition.c b/examples/fluids/src/bc_definition.c
new file mode 100644
index 0000000000..acdb50a370
--- /dev/null
+++ b/examples/fluids/src/bc_definition.c
@@ -0,0 +1,106 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <bc_definition.h>
+
+/**
+   @brief Create `BCDefinition`
+
+   @param[in]  name             Name of the boundary condition
+   @param[in]  num_label_values Number of `DMLabel` values
+   @param[in]  label_values     Array of label values that define the boundaries controlled by the `BCDefinition`, size `num_label_values`
+   @param[out] bc_def           The new `BCDefinition`
+**/
+PetscErrorCode BCDefinitionCreate(const char *name, PetscInt num_label_values, PetscInt label_values[], BCDefinition *bc_def) {
+  PetscFunctionBeginUser;
+  PetscCall(PetscNew(bc_def));
+
+  PetscCall(PetscStrallocpy(name, &(*bc_def)->name));
+  (*bc_def)->num_label_values = num_label_values;
+  PetscCall(PetscMalloc1(num_label_values, &(*bc_def)->label_values));
+  for (PetscInt i = 0; i < num_label_values; i++) (*bc_def)->label_values[i] = label_values[i];
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+   @brief Get base information for `BCDefinition`
+
+   @param[in]  bc_def           `BCDefinition` to get information from
+   @param[out] name             Name of the `BCDefinition`
+   @param[out] num_label_values Number of `DMLabel` values
+   @param[out] label_values     Array of label values that define the boundaries controlled by the `BCDefinition`, size `num_label_values`
+**/
+PetscErrorCode BCDefinitionGetInfo(BCDefinition bc_def, const char *name[], PetscInt *num_label_values, const PetscInt *label_values[]) {
+  PetscFunctionBeginUser;
+  if (name) *name = bc_def->name;
+  if (label_values) {
+    *num_label_values = bc_def->num_label_values;
+    *label_values     = bc_def->label_values;
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+   @brief Destory a `BCDefinition` object
+
+   @param[in,out] bc_def `BCDefinition` to be destroyed
+**/
+PetscErrorCode BCDefinitionDestroy(BCDefinition *bc_def) {
+  PetscFunctionBeginUser;
+  if ((*bc_def)->name) PetscCall(PetscFree((*bc_def)->name));
+  if ((*bc_def)->label_values) PetscCall(PetscFree((*bc_def)->label_values));
+  if ((*bc_def)->essential_comps) PetscCall(PetscFree((*bc_def)->essential_comps));
+  PetscCall(PetscFree(*bc_def));
+  *bc_def = NULL;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+   @brief Set `DM_BC_ESSENTIAL` boundary condition values
+
+   @param[in,out] bc_def              `BCDefinition` to set values to
+   @param[in]     num_essential_comps Number of components to set
+   @param[in]     essential_comps     Array of components to set, size `num_essential_comps`
+**/
+PetscErrorCode BCDefinitionSetEssential(BCDefinition bc_def, PetscInt num_essential_comps, PetscInt essential_comps[]) {
+  PetscFunctionBeginUser;
+  bc_def->num_essential_comps = num_essential_comps;
+  PetscCall(PetscMalloc1(num_essential_comps, &bc_def->essential_comps));
+  PetscCall(PetscArraycpy(bc_def->essential_comps, essential_comps, num_essential_comps));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+   @brief Get `DM_BC_ESSENTIAL` boundary condition values
+
+   @param[in]  bc_def              `BCDefinition` to set values to
+   @param[out] num_essential_comps Number of components to set
+   @param[out] essential_comps     Array of components to set, size `num_essential_comps`
+**/
+PetscErrorCode BCDefinitionGetEssential(BCDefinition bc_def, PetscInt *num_essential_comps, const PetscInt *essential_comps[]) {
+  PetscFunctionBeginUser;
+  *num_essential_comps = bc_def->num_essential_comps;
+  *essential_comps     = bc_def->essential_comps;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+#define LABEL_ARRAY_SIZE 256
+
+// @brief See `PetscOptionsBCDefinition`
+PetscErrorCode PetscOptionsBCDefinition_Private(PetscOptionItems PetscOptionsObject, const char opt[], const char text[], const char man[],
+                                                const char name[], BCDefinition *bc_def, PetscBool *set) {
+  PetscInt num_label_values = LABEL_ARRAY_SIZE, label_values[LABEL_ARRAY_SIZE] = {0};
+
+  PetscFunctionBeginUser;
+  PetscCall(PetscOptionsIntArray(opt, text, man, label_values, &num_label_values, set));
+  if (num_label_values > 0) {
+    PetscCall(BCDefinitionCreate(name, num_label_values, label_values, bc_def));
+  } else {
+    *bc_def = NULL;
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
diff --git a/examples/fluids/src/boundary_condition.c b/examples/fluids/src/boundary_condition.c
new file mode 100644
index 0000000000..89e917634d
--- /dev/null
+++ b/examples/fluids/src/boundary_condition.c
@@ -0,0 +1,100 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "../navierstokes.h"
+
+/**
+   @brief Add `BCDefinition` to a `PetscSegBuffer`
+
+   @param[in]     bc_def      `BCDefinition` to add
+   @param[in,out] bc_defs_seg `PetscSegBuffer` to add to
+**/
+static PetscErrorCode AddBCDefinitionToSegBuffer(BCDefinition bc_def, PetscSegBuffer bc_defs_seg) {
+  BCDefinition *bc_def_ptr;
+
+  PetscFunctionBeginUser;
+  if (bc_def == NULL) PetscFunctionReturn(PETSC_SUCCESS);
+  PetscCall(PetscSegBufferGet(bc_defs_seg, 1, &bc_def_ptr));
+  *bc_def_ptr = bc_def;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+   @brief Create and setup `BCDefinition`s and `SimpleBC` from commandline options
+
+   @param[in]     user    `User`
+   @param[in,out] problem `ProblemData`
+   @param[in]     app_ctx `AppCtx`
+   @param[in,out] bc      `SimpleBC`
+**/
+PetscErrorCode BoundaryConditionSetUp(User user, ProblemData problem, AppCtx app_ctx, SimpleBC bc) {
+  PetscSegBuffer bc_defs_seg;
+  PetscBool      flg;
+  BCDefinition   bc_def;
+
+  PetscFunctionBeginUser;
+  PetscCall(PetscSegBufferCreate(sizeof(BCDefinition), 4, &bc_defs_seg));
+
+  PetscOptionsBegin(user->comm, NULL, "Boundary Condition Options", NULL);
+
+  PetscCall(PetscOptionsBCDefinition("-bc_wall", "Face IDs to apply wall BC", NULL, "wall", &bc_def, NULL));
+  PetscCall(AddBCDefinitionToSegBuffer(bc_def, bc_defs_seg));
+  if (bc_def) {
+    PetscInt num_essential_comps = 16, essential_comps[16];
+
+    PetscCall(PetscOptionsIntArray("-wall_comps", "An array of constrained component numbers", NULL, essential_comps, &num_essential_comps, &flg));
+    PetscCall(BCDefinitionSetEssential(bc_def, num_essential_comps, essential_comps));
+
+    app_ctx->wall_forces.num_wall = bc_def->num_label_values;
+    PetscCall(PetscMalloc1(bc_def->num_label_values, &app_ctx->wall_forces.walls));
+    PetscCall(PetscArraycpy(app_ctx->wall_forces.walls, bc_def->label_values, bc_def->num_label_values));
+  }
+
+  {  // Symmetry Boundary Conditions
+    const char *deprecated[3] = {"-bc_slip_x", "-bc_slip_y", "-bc_slip_z"};
+    const char *flags[3]      = {"-bc_symmetry_x", "-bc_symmetry_y", "-bc_symmetry_z"};
+
+    for (PetscInt j = 0; j < 3; j++) {
+      PetscCall(PetscOptionsDeprecated(deprecated[j], flags[j], "libCEED 0.12.0",
+                                       "Use -bc_symmetry_[x,y,z] for direct equivalency, or -bc_slip for weak, Riemann-based, direction-invariant "
+                                       "slip/no-penatration boundary conditions"));
+      PetscCall(PetscOptionsBCDefinition(flags[j], "Face IDs to apply symmetry BC", NULL, "symmetry", &bc_def, NULL));
+      if (!bc_def) {
+        PetscCall(PetscOptionsBCDefinition(deprecated[j], "Face IDs to apply symmetry BC", NULL, "symmetry", &bc_def, NULL));
+      }
+      PetscCall(AddBCDefinitionToSegBuffer(bc_def, bc_defs_seg));
+      if (bc_def) {
+        PetscInt essential_comps[1] = {j + 1};
+
+        PetscCall(BCDefinitionSetEssential(bc_def, 1, essential_comps));
+      }
+    }
+  }
+
+  // Inflow BCs
+  bc->num_inflow = 16;
+  PetscCall(PetscOptionsIntArray("-bc_inflow", "Face IDs to apply inflow BC", NULL, bc->inflows, &bc->num_inflow, NULL));
+  // Outflow BCs
+  bc->num_outflow = 16;
+  PetscCall(PetscOptionsIntArray("-bc_outflow", "Face IDs to apply outflow BC", NULL, bc->outflows, &bc->num_outflow, NULL));
+  // Freestream BCs
+  bc->num_freestream = 16;
+  PetscCall(PetscOptionsIntArray("-bc_freestream", "Face IDs to apply freestream BC", NULL, bc->freestreams, &bc->num_freestream, NULL));
+
+  bc->num_slip = 16;
+  PetscCall(PetscOptionsIntArray("-bc_slip", "Face IDs to apply slip BC", NULL, bc->slips, &bc->num_slip, NULL));
+
+  PetscOptionsEnd();
+
+  PetscCall(PetscSegBufferGetSize(bc_defs_seg, &problem->num_bc_defs));
+  PetscCall(PetscSegBufferExtractAlloc(bc_defs_seg, &problem->bc_defs));
+  PetscCall(PetscSegBufferDestroy(&bc_defs_seg));
+
+  //TODO: Verify that the BCDefinition don't have overlapping claims to boundary faces
+
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
diff --git a/examples/fluids/src/cloptions.c b/examples/fluids/src/cloptions.c
index 1fa601231a..905144216c 100644
--- a/examples/fluids/src/cloptions.c
+++ b/examples/fluids/src/cloptions.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -134,60 +134,6 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC
     strncpy(app_ctx->problem_name, problem_name, 16);
   }
 
-  // Wall Boundary Conditions
-  bc->num_wall = 16;
-  PetscBool flg;
-  PetscCall(PetscOptionsIntArray("-bc_wall", "Face IDs to apply wall BC", NULL, bc->walls, &bc->num_wall, NULL));
-  bc->num_comps = 5;
-  PetscCall(PetscOptionsIntArray("-wall_comps", "An array of constrained component numbers", NULL, bc->wall_comps, &bc->num_comps, &flg));
-
-  {  // Symmetry Boundary Conditions
-    const char *deprecated[3] = {"-bc_slip_x", "-bc_slip_y", "-bc_slip_z"};
-    const char *flags[3]      = {"-bc_symmetry_x", "-bc_symmetry_y", "-bc_symmetry_z"};
-    PetscBool   flg, has_symmetry = PETSC_FALSE;
-
-    for (PetscInt j = 0; j < 3; j++) {
-      bc->num_symmetry[j] = 16;
-      PetscCall(PetscOptionsDeprecated(deprecated[j], flags[j], "libCEED 0.12.0",
-                                       "Use -bc_symmetry_[x,y,z] for direct equivalency, or -bc_slip for weak, Riemann-based, direction-invariant "
-                                       "slip/no-penatration boundary conditions"));
-      PetscCall(PetscOptionsIntArray(flags[j], "Face IDs to apply symmetry BC", NULL, bc->symmetries[j], &bc->num_symmetry[j], &flg));
-      if (!flg) {
-        bc->num_symmetry[j] = 16;
-        PetscCall(PetscOptionsIntArray(deprecated[j], "Face IDs to apply slip BC", NULL, bc->symmetries[j], &bc->num_symmetry[j], &flg));
-      }
-      if (bc->num_symmetry[j] > 0) has_symmetry = PETSC_TRUE;
-    }
-
-    // Error if wall and symmetry BCs are set on the same face
-    if (has_symmetry) {
-      for (PetscInt c = 0; c < 3; c++) {
-        for (PetscInt s = 0; s < bc->num_symmetry[c]; s++) {
-          for (PetscInt w = 0; w < bc->num_wall; w++) {
-            PetscCheck(bc->symmetries[c][s] != bc->walls[w], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG,
-                       "Boundary condition already set on face %" PetscInt_FMT "!\n", bc->walls[w]);
-          }
-        }
-      }
-    }
-  }
-  app_ctx->wall_forces.num_wall = bc->num_wall;
-  PetscCall(PetscMalloc1(bc->num_wall, &app_ctx->wall_forces.walls));
-  PetscCall(PetscArraycpy(app_ctx->wall_forces.walls, bc->walls, bc->num_wall));
-
-  // Inflow BCs
-  bc->num_inflow = 16;
-  PetscCall(PetscOptionsIntArray("-bc_inflow", "Face IDs to apply inflow BC", NULL, bc->inflows, &bc->num_inflow, NULL));
-  // Outflow BCs
-  bc->num_outflow = 16;
-  PetscCall(PetscOptionsIntArray("-bc_outflow", "Face IDs to apply outflow BC", NULL, bc->outflows, &bc->num_outflow, NULL));
-  // Freestream BCs
-  bc->num_freestream = 16;
-  PetscCall(PetscOptionsIntArray("-bc_freestream", "Face IDs to apply freestream BC", NULL, bc->freestreams, &bc->num_freestream, NULL));
-
-  bc->num_slip = 16;
-  PetscCall(PetscOptionsIntArray("-bc_slip", "Face IDs to apply slip BC", NULL, bc->slips, &bc->num_slip, NULL));
-
   // Statistics Options
   app_ctx->turb_spanstats_collect_interval = 1;
   PetscCall(PetscOptionsInt("-ts_monitor_turbulence_spanstats_collect_interval", "Number of timesteps between statistics collection", NULL,
@@ -203,11 +149,6 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC
   PetscCall(PetscOptionsViewer("-ts_monitor_wall_force", "Viewer for force on each (no-slip) wall", NULL, &app_ctx->wall_forces.viewer,
                                &app_ctx->wall_forces.viewer_format, NULL));
 
-  // SGS Model Options
-  app_ctx->sgs_model_type = SGS_MODEL_NONE;
-  PetscCall(PetscOptionsEnum("-sgs_model_type", "Subgrid Stress Model type", NULL, SGSModelTypes, (PetscEnum)app_ctx->sgs_model_type,
-                             (PetscEnum *)&app_ctx->sgs_model_type, NULL));
-
   PetscCall(PetscOptionsBool("-diff_filter_monitor", "Enable differential filtering TSMonitor", NULL, app_ctx->diff_filter_monitor,
                              &app_ctx->diff_filter_monitor, NULL));
 
@@ -216,9 +157,6 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC
   PetscCall(PetscOptionsEnum("-mesh_transform", "Mesh transform to perform", NULL, MeshTransformTypes, (PetscEnum)app_ctx->mesh_transform_type,
                              (PetscEnum *)&app_ctx->mesh_transform_type, NULL));
 
-  PetscCall(
-      PetscOptionsBool("-sgs_train_enable", "Enable Data-Driven SGS training", NULL, app_ctx->sgs_train_enable, &app_ctx->sgs_train_enable, NULL));
-
   PetscOptionsEnd();
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/examples/fluids/src/differential_filter.c b/examples/fluids/src/differential_filter.c
index 414c7154f8..c3f1478867 100644
--- a/examples/fluids/src/differential_filter.c
+++ b/examples/fluids/src/differential_filter.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,6 +8,7 @@
 /// Functions for setting up and performing differential filtering
 
 #include "../qfunctions//differential_filter.h"
+#include <ceed.h>
 
 #include <petscdmplex.h>
 
@@ -36,8 +37,9 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
       case STATEVAR_CONSERVATIVE:
         PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, DifferentialFilter_RHS_Conserv, DifferentialFilter_RHS_Conserv_loc, &qf_rhs));
         break;
-      default:
-        SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "Differential filtering not available for chosen state variable");
+      case STATEVAR_ENTROPY:
+        PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, DifferentialFilter_RHS_Entropy, DifferentialFilter_RHS_Entropy_loc, &qf_rhs));
+        break;
     }
     if (diff_filter->do_mms_test) {
       PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_rhs));
@@ -69,6 +71,9 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
 
       PetscCall(PetscSNPrintf(field_name, PETSC_MAX_PATH_LEN, "v%" PetscInt_FMT, dm_field));
       PetscCallCeed(ceed, CeedOperatorSetField(op_rhs, field_name, elem_restr_filter, basis_filter, CEED_VECTOR_ACTIVE));
+
+      PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_filter));
+      PetscCallCeed(ceed, CeedBasisDestroy(&basis_filter));
     }
 
     PetscCall(OperatorApplyContextCreate(user->dm, dm_filter, ceed, op_rhs, NULL, NULL, user->Q_loc, NULL, &diff_filter->op_rhs_ctx));
@@ -91,7 +96,7 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
     // -- Get Grid anisotropy tensor
     PetscCall(GridAnisotropyTensorCalculateCollocatedVector(ceed, user, ceed_data, &elem_restr_grid_aniso, &grid_aniso_ceed, &num_comp_grid_aniso));
 
-    PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_lhs));
+    PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_lhs));
     for (PetscInt i = 0; i < diff_filter->num_filtered_fields; i++) {
       CeedQFunction       qf_lhs;
       PetscInt            num_comp_filter = diff_filter->num_field_components[i];
@@ -132,8 +137,7 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
         char              field_name[PETSC_MAX_PATH_LEN];
         PetscCall(PetscSNPrintf(field_name, PETSC_MAX_PATH_LEN, "v%" PetscInt_FMT, i));
         PetscCallCeed(ceed, CeedOperatorGetFieldByName(diff_filter->op_rhs_ctx->op, field_name, &op_field));
-        PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_filter));
-        PetscCallCeed(ceed, CeedOperatorFieldGetBasis(op_field, &basis_filter));
+        PetscCallCeed(ceed, CeedOperatorFieldGetData(op_field, NULL, &elem_restr_filter, &basis_filter, NULL));
       }
 
       PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_lhs, NULL, NULL, &op_lhs_sub));
@@ -145,12 +149,17 @@ PetscErrorCode DifferentialFilterCreateOperators(Ceed ceed, User user, CeedData
       PetscCallCeed(ceed, CeedOperatorSetField(op_lhs_sub, "v", elem_restr_filter, basis_filter, CEED_VECTOR_ACTIVE));
       PetscCallCeed(ceed, CeedOperatorSetField(op_lhs_sub, "Grad_v", elem_restr_filter, basis_filter, CEED_VECTOR_ACTIVE));
 
-      PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_lhs, op_lhs_sub));
+      PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_lhs, op_lhs_sub));
+      PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_filter));
+      PetscCallCeed(ceed, CeedBasisDestroy(&basis_filter));
       PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_lhs));
       PetscCallCeed(ceed, CeedOperatorDestroy(&op_lhs_sub));
     }
+    PetscCallCeed(ceed, CeedVectorDestroy(&grid_aniso_ceed));
+    PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_grid_aniso));
+
     PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_lhs, "filter width scaling", &diff_filter->filter_width_scaling_label));
-    PetscCall(MatCeedCreate(dm_filter, dm_filter, op_lhs, NULL, &mat_lhs));
+    PetscCall(MatCreateCeed(dm_filter, dm_filter, op_lhs, NULL, &mat_lhs));
 
     PetscCall(KSPCreate(PetscObjectComm((PetscObject)dm_filter), &diff_filter->ksp));
     PetscCall(KSPSetOptionsPrefix(diff_filter->ksp, "diff_filter_"));
@@ -264,9 +273,10 @@ PetscErrorCode DifferentialFilterSetup(Ceed ceed, User user, CeedData ceed_data,
   PetscCallCeed(ceed, CeedQFunctionContextCreate(ceed, &diff_filter_qfctx));
   PetscCallCeed(ceed, CeedQFunctionContextSetData(diff_filter_qfctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*diff_filter_ctx), diff_filter_ctx));
   PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(diff_filter_qfctx, CEED_MEM_HOST, FreeContextPetsc));
-  PetscCallCeed(ceed, CeedQFunctionContextRegisterDouble(
-                          diff_filter_qfctx, "filter width scaling", offsetof(struct DifferentialFilterContext_, width_scaling),
-                          sizeof(diff_filter_ctx->width_scaling) / sizeof(diff_filter_ctx->width_scaling[0]), "Filter width scaling"));
+  PetscCallCeed(ceed, CeedQFunctionContextRegisterDouble(diff_filter_qfctx, "filter width scaling",
+                                                         offsetof(struct DifferentialFilterContext_, width_scaling),
+                                                         sizeof(diff_filter_ctx->width_scaling) / sizeof(diff_filter_ctx->width_scaling[0]),
+                                                         "Filter width scaling"));
 
   // -- Setup Operators
   PetscCall(DifferentialFilterCreateOperators(ceed, user, ceed_data, diff_filter_qfctx));
diff --git a/examples/fluids/src/dm_utils.c b/examples/fluids/src/dm_utils.c
index 074240fbfc..b7a1bf8ea7 100644
--- a/examples/fluids/src/dm_utils.c
+++ b/examples/fluids/src/dm_utils.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -67,8 +67,8 @@ PetscErrorCode DMPlexCeedElemRestrictionCreate(Ceed ceed, DM dm, DMLabel domain_
   CeedInt *restriction_offsets_ceed = NULL;
 
   PetscFunctionBeginUser;
-  PetscCall(
-      DMPlexGetLocalOffsets(dm, domain_label, label_value, height, dm_field, &num_elem, &elem_size, &num_comp, &num_dof, &restriction_offsets_petsc));
+  PetscCall(DMPlexGetLocalOffsets(dm, domain_label, label_value, height, dm_field, &num_elem, &elem_size, &num_comp, &num_dof,
+                                  &restriction_offsets_petsc));
   PetscCall(IntArrayPetscToCeed(num_elem * elem_size, &restriction_offsets_petsc, &restriction_offsets_ceed));
   PetscCallCeed(ceed, CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp, 1, num_dof, CEED_MEM_HOST, CEED_COPY_VALUES,
                                                 restriction_offsets_ceed, restriction));
diff --git a/examples/fluids/src/grid_anisotropy_tensor.c b/examples/fluids/src/grid_anisotropy_tensor.c
index 15692ee7d6..8e5ffecb49 100644
--- a/examples/fluids/src/grid_anisotropy_tensor.c
+++ b/examples/fluids/src/grid_anisotropy_tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -75,7 +75,7 @@ PetscErrorCode GridAnisotropyTensorProjectionSetupApply(Ceed ceed, User user, Ce
   {  // -- Setup KSP for L^2 projection
     Mat mat_mass;
 
-    PetscCall(MatCeedCreate(grid_aniso_proj->dm, grid_aniso_proj->dm, op_mass, NULL, &mat_mass));
+    PetscCall(MatCreateCeed(grid_aniso_proj->dm, grid_aniso_proj->dm, op_mass, NULL, &mat_mass));
 
     PetscCall(KSPCreate(comm, &ksp));
     PetscCall(KSPSetOptionsPrefix(ksp, "grid_anisotropy_tensor_projection_"));
diff --git a/examples/fluids/src/inverse_multiplicity.c b/examples/fluids/src/inverse_multiplicity.c
index 2d71cc15fe..0e8cb90cbf 100644
--- a/examples/fluids/src/inverse_multiplicity.c
+++ b/examples/fluids/src/inverse_multiplicity.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/fluids/src/log_events.c b/examples/fluids/src/log_events.c
new file mode 100644
index 0000000000..d67b312250
--- /dev/null
+++ b/examples/fluids/src/log_events.c
@@ -0,0 +1,36 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <log_events.h>
+#include <petsc.h>
+
+static PetscClassId libCEED_classid, misc_classid;
+
+PetscLogEvent FLUIDS_CeedOperatorApply;
+PetscLogEvent FLUIDS_CeedOperatorAssemble;
+PetscLogEvent FLUIDS_CeedOperatorAssembleDiagonal;
+PetscLogEvent FLUIDS_CeedOperatorAssemblePointBlockDiagonal;
+PetscLogEvent FLUIDS_SmartRedis_Init;
+PetscLogEvent FLUIDS_SmartRedis_Meta;
+PetscLogEvent FLUIDS_SmartRedis_Train;
+PetscLogEvent FLUIDS_TrainDataCompute;
+PetscLogEvent FLUIDS_DifferentialFilter;
+PetscLogEvent FLUIDS_VelocityGradientProjection;
+
+PetscErrorCode RegisterLogEvents() {
+  PetscFunctionBeginUser;
+  PetscCall(PetscClassIdRegister("libCEED", &libCEED_classid));
+  PetscCall(PetscLogEventRegister("CeedOpApply", libCEED_classid, &FLUIDS_CeedOperatorApply));
+  PetscCall(PetscLogEventRegister("CeedOpAsm", libCEED_classid, &FLUIDS_CeedOperatorAssemble));
+  PetscCall(PetscLogEventRegister("CeedOpAsmD", libCEED_classid, &FLUIDS_CeedOperatorAssembleDiagonal));
+  PetscCall(PetscLogEventRegister("CeedOpAsmPBD", libCEED_classid, &FLUIDS_CeedOperatorAssemblePointBlockDiagonal));
+
+  PetscCall(PetscClassIdRegister("Miscellaneous", &misc_classid));
+  PetscCall(PetscLogEventRegister("DiffFilter", misc_classid, &FLUIDS_DifferentialFilter));
+  PetscCall(PetscLogEventRegister("VeloGradProj", misc_classid, &FLUIDS_VelocityGradientProjection));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
diff --git a/examples/fluids/src/mat-ceed.c b/examples/fluids/src/mat-ceed.c
index 246df8779c..5e8ebef86f 100644
--- a/examples/fluids/src/mat-ceed.c
+++ b/examples/fluids/src/mat-ceed.c
@@ -1,17 +1,23 @@
 /// @file
-/// MatCeed and it's related operators
+/// MatCEED implementation
 
-#include <ceed-utils.h>
 #include <ceed.h>
 #include <ceed/backend.h>
 #include <mat-ceed-impl.h>
 #include <mat-ceed.h>
-#include <petscdmplex.h>
+#include <petsc-ceed-utils.h>
+#include <petsc-ceed.h>
+#include <petscdm.h>
+#include <petscmat.h>
+#include <stdbool.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 PetscClassId  MATCEED_CLASSID;
-PetscLogEvent MATCEED_MULT, MATCEED_MULT_TRANSPOSE;
+PetscLogEvent MATCEED_MULT, MATCEED_MULT_CEEDOP, MATCEED_MULT_TRANSPOSE, MATCEED_MULT_TRANSPOSE_CEEDOP, MATCEED_ASSEMBLE_DIAGONAL,
+    MATCEED_ASSEMBLE_DIAGONAL_CEEDOP, MATCEED_SETUP_PBDIAGONAL, MATCEED_SETUP_PBDIAGONAL_CEEDOP, MATCEED_ASSEMBLE_PBDIAGONAL,
+    MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP, MATCEED_SETUP_FULL, MATCEED_SETUP_FULL_CEEDOP, MATCEED_ASSEMBLE_FULL, MATCEED_ASSEMBLE_FULL_CEEDOP;
 
 /**
   @brief Register MATCEED log events.
@@ -21,67 +27,25 @@ PetscLogEvent MATCEED_MULT, MATCEED_MULT_TRANSPOSE;
   @return An error code: 0 - success, otherwise - failure
 **/
 static PetscErrorCode MatCeedRegisterLogEvents() {
-  static bool registered = false;
+  static PetscBool registered = PETSC_FALSE;
 
   PetscFunctionBeginUser;
   if (registered) PetscFunctionReturn(PETSC_SUCCESS);
-  PetscCall(PetscClassIdRegister("MATCEED", &MATCEED_CLASSID));
-  PetscCall(PetscLogEventRegister("MATCEED Mult", MATCEED_CLASSID, &MATCEED_MULT));
-  PetscCall(PetscLogEventRegister("MATCEED Mult Transpose", MATCEED_CLASSID, &MATCEED_MULT_TRANSPOSE));
-  registered = true;
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-/**
-  @brief Setup inner `Mat` for `PC` operations not directly supported by libCEED.
-
-  Collective across MPI processes.
-
-  @param[in]   mat_ceed   `MATCEED` to setup
-  @param[out]  mat_inner  Inner `Mat`
-
-  @return An error code: 0 - success, otherwise - failure
-**/
-static PetscErrorCode MatCeedSetupInnerMat(Mat mat_ceed, Mat *mat_inner) {
-  MatCeedContext ctx;
-
-  PetscFunctionBeginUser;
-  PetscCall(MatShellGetContext(mat_ceed, &ctx));
-
-  PetscCheck(ctx->dm_x == ctx->dm_y, PetscObjectComm((PetscObject)mat_ceed), PETSC_ERR_SUP, "PC only supported for MATCEED on a single DM");
-
-  // Check cl mat type
-  {
-    PetscBool is_internal_mat_type_cl = PETSC_FALSE;
-    char      internal_mat_type_cl[64];
-
-    // Check for specific CL inner mat type for this Mat
-    {
-      const char *mat_ceed_prefix = NULL;
-
-      PetscCall(MatGetOptionsPrefix(mat_ceed, &mat_ceed_prefix));
-      PetscOptionsBegin(PetscObjectComm((PetscObject)mat_ceed), mat_ceed_prefix, "", NULL);
-      PetscCall(PetscOptionsFList("-ceed_inner_mat_type", "MATCEED inner assembled MatType for PC support", NULL, MatList, internal_mat_type_cl,
-                                  internal_mat_type_cl, sizeof(internal_mat_type_cl), &is_internal_mat_type_cl));
-      PetscOptionsEnd();
-      if (is_internal_mat_type_cl) {
-        PetscCall(PetscFree(ctx->internal_mat_type));
-        PetscCall(PetscStrallocpy(internal_mat_type_cl, &ctx->internal_mat_type));
-      }
-    }
-  }
-
-  // Create sparse matrix
-  {
-    MatType dm_mat_type, dm_mat_type_copy;
-
-    PetscCall(DMGetMatType(ctx->dm_x, &dm_mat_type));
-    PetscCall(PetscStrallocpy(dm_mat_type, (char **)&dm_mat_type_copy));
-    PetscCall(DMSetMatType(ctx->dm_x, ctx->internal_mat_type));
-    PetscCall(DMCreateMatrix(ctx->dm_x, mat_inner));
-    PetscCall(DMSetMatType(ctx->dm_x, dm_mat_type_copy));
-    PetscCall(PetscFree(dm_mat_type_copy));
-  }
+  PetscCall(PetscClassIdRegister("MatCEED", &MATCEED_CLASSID));
+  PetscCall(PetscLogEventRegister("MatCEEDMul", MATCEED_CLASSID, &MATCEED_MULT));
+  PetscCall(PetscLogEventRegister("MatCEEDMulCeed", MATCEED_CLASSID, &MATCEED_MULT_CEEDOP));
+  PetscCall(PetscLogEventRegister("MatCEEDMulT", MATCEED_CLASSID, &MATCEED_MULT_TRANSPOSE));
+  PetscCall(PetscLogEventRegister("MatCEEDMulTCeed", MATCEED_CLASSID, &MATCEED_MULT_TRANSPOSE_CEEDOP));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmDiag", MATCEED_CLASSID, &MATCEED_ASSEMBLE_DIAGONAL));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmPBDSU", MATCEED_CLASSID, &MATCEED_SETUP_PBDIAGONAL));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmPBDSUCeed", MATCEED_CLASSID, &MATCEED_SETUP_PBDIAGONAL_CEEDOP));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmPBD", MATCEED_CLASSID, &MATCEED_ASSEMBLE_PBDIAGONAL));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmPBDCeed", MATCEED_CLASSID, &MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmSU", MATCEED_CLASSID, &MATCEED_SETUP_FULL));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmSUCeed", MATCEED_CLASSID, &MATCEED_SETUP_FULL_CEEDOP));
+  PetscCall(PetscLogEventRegister("MatCEEDAsm", MATCEED_CLASSID, &MATCEED_ASSEMBLE_FULL));
+  PetscCall(PetscLogEventRegister("MatCEEDAsmCeed", MATCEED_CLASSID, &MATCEED_ASSEMBLE_FULL_CEEDOP));
+  registered = PETSC_TRUE;
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -117,12 +81,15 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat
       PetscLogStage stage_amg_setup;
 
       // -- Assemble sparsity pattern if mat hasn't been assembled before
-      PetscCall(PetscLogStageGetId("MATCEED Assembly Setup", &stage_amg_setup));
+      PetscCall(PetscLogStageGetId("MatCEED Asm Setup", &stage_amg_setup));
       if (stage_amg_setup == -1) {
-        PetscCall(PetscLogStageRegister("MATCEED Assembly Setup", &stage_amg_setup));
+        PetscCall(PetscLogStageRegister("MatCEED Asm Setup", &stage_amg_setup));
       }
       PetscCall(PetscLogStagePush(stage_amg_setup));
+      PetscCall(PetscLogEventBegin(MATCEED_SETUP_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL));
+      PetscCall(PetscLogEventBegin(MATCEED_SETUP_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
       PetscCallCeed(ctx->ceed, CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(ctx->op_mult, &num_entries, &rows_ceed, &cols_ceed));
+      PetscCall(PetscLogEventEnd(MATCEED_SETUP_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
       PetscCall(IntArrayCeedToPetsc(num_entries, &rows_ceed, &rows_petsc));
       PetscCall(IntArrayCeedToPetsc(num_entries, &cols_ceed, &cols_petsc));
       PetscCall(MatSetPreallocationCOOLocal(mat_coo, num_entries, rows_petsc, cols_petsc));
@@ -131,11 +98,13 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat
       if (!ctx->coo_values_pbd) PetscCallCeed(ctx->ceed, CeedVectorCreate(ctx->ceed, num_entries, &ctx->coo_values_pbd));
       PetscCall(PetscRealloc(++ctx->num_mats_assembled_pbd * sizeof(Mat), &ctx->mats_assembled_pbd));
       ctx->mats_assembled_pbd[ctx->num_mats_assembled_pbd - 1] = mat_coo;
+      PetscCall(PetscLogEventEnd(MATCEED_SETUP_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL));
       PetscCall(PetscLogStagePop());
     }
   }
 
   // Assemble mat_ceed
+  PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL));
   PetscCall(MatAssemblyBegin(mat_coo, MAT_FINAL_ASSEMBLY));
   {
     const CeedScalar *values;
@@ -148,7 +117,9 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat
     else if (strstr(mat_type, "kokkos")) mem_type = CEED_MEM_DEVICE;
     else mem_type = CEED_MEM_HOST;
 
+    PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
     PetscCallCeed(ctx->ceed, CeedOperatorLinearAssemblePointBlockDiagonal(ctx->op_mult, ctx->coo_values_pbd, CEED_REQUEST_IMMEDIATE));
+    PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_PBDIAGONAL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
     PetscCallCeed(ctx->ceed, CeedVectorGetArrayRead(ctx->coo_values_pbd, mem_type, &values));
     PetscCall(MatSetValuesCOO(mat_coo, values, INSERT_VALUES));
     PetscCall(MatIsSPDKnown(mat_ceed, &is_spd_known, &is_spd));
@@ -156,6 +127,7 @@ static PetscErrorCode MatCeedAssemblePointBlockDiagonalCOO(Mat mat_ceed, Mat mat
     PetscCallCeed(ctx->ceed, CeedVectorRestoreArrayRead(ctx->coo_values_pbd, &values));
   }
   PetscCall(MatAssemblyEnd(mat_coo, MAT_FINAL_ASSEMBLY));
+  PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_PBDIAGONAL, mat_ceed, mat_coo, NULL, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -177,14 +149,14 @@ static PetscErrorCode MatCeedAssembleInnerBlockDiagonalMat(Mat mat_ceed, PetscBo
   PetscCall(MatShellGetContext(mat_ceed, &ctx));
   if (use_ceed_pbd) {
     // Check if COO pattern set
-    if (!ctx->mat_assembled_pbd_internal) PetscCall(MatCeedSetupInnerMat(mat_ceed, &ctx->mat_assembled_pbd_internal));
+    if (!ctx->mat_assembled_pbd_internal) PetscCall(MatCeedCreateMatCOO(mat_ceed, &ctx->mat_assembled_pbd_internal));
 
     // Assemble mat_assembled_full_internal
     PetscCall(MatCeedAssemblePointBlockDiagonalCOO(mat_ceed, ctx->mat_assembled_pbd_internal));
     if (mat_inner) *mat_inner = ctx->mat_assembled_pbd_internal;
   } else {
     // Check if COO pattern set
-    if (!ctx->mat_assembled_full_internal) PetscCall(MatCeedSetupInnerMat(mat_ceed, &ctx->mat_assembled_full_internal));
+    if (!ctx->mat_assembled_full_internal) PetscCall(MatCeedCreateMatCOO(mat_ceed, &ctx->mat_assembled_full_internal));
 
     // Assemble mat_assembled_full_internal
     PetscCall(MatCeedAssembleCOO(mat_ceed, ctx->mat_assembled_full_internal));
@@ -194,79 +166,134 @@ static PetscErrorCode MatCeedAssembleInnerBlockDiagonalMat(Mat mat_ceed, PetscBo
 }
 
 /**
-  @brief Get `MATCEED` diagonal block for Jacobi.
+  @brief Get `MATCEED` variable block diagonal for Jacobi.
 
   Collective across MPI processes.
 
   @param[in]   mat_ceed   `MATCEED` to invert
-  @param[out]  mat_block  The diagonal block matrix
+  @param[out]  mat_vblock  The variable diagonal block matrix
 
   @return An error code: 0 - success, otherwise - failure
 **/
-static PetscErrorCode MatGetDiagonalBlock_Ceed(Mat mat_ceed, Mat *mat_block) {
-  Mat            mat_inner = NULL;
+static PetscErrorCode MatGetVariableBlockDiagonal_Ceed(Mat mat_ceed, Mat *mat_vblock) {
   MatCeedContext ctx;
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat_ceed, &ctx));
 
   // Assemble inner mat if needed
-  PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_pbd_valid, &mat_inner));
-
-  // Get block diagonal
-  PetscCall(MatGetDiagonalBlock(mat_inner, mat_block));
+  PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_vpbd_valid, mat_vblock));
+  PetscCall(PetscObjectReference((PetscObject)*mat_vblock));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
 /**
-  @brief Invert `MATCEED` diagonal block for Jacobi.
+  @brief Get `MATCEED` block diagonal for Jacobi.
 
   Collective across MPI processes.
 
-  @param[in]   mat_ceed  `MATCEED` to invert
-  @param[out]  values    The block inverses in column major order
+  @param[in]   mat_ceed   `MATCEED` to invert
+  @param[out]  mat_block  The variable diagonal block matrix
 
   @return An error code: 0 - success, otherwise - failure
 **/
-static PetscErrorCode MatInvertBlockDiagonal_Ceed(Mat mat_ceed, const PetscScalar **values) {
-  Mat            mat_inner = NULL;
+static PetscErrorCode MatGetBlockDiagonal_Ceed(Mat mat_ceed, Mat *mat_block) {
   MatCeedContext ctx;
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat_ceed, &ctx));
 
   // Assemble inner mat if needed
-  PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_pbd_valid, &mat_inner));
-
-  // Invert PB diagonal
-  PetscCall(MatInvertBlockDiagonal(mat_inner, values));
+  PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_pbd_valid, mat_block));
+  PetscCall(PetscObjectReference((PetscObject)*mat_block));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
 /**
-  @brief Invert `MATCEED` variable diagonal block for Jacobi.
+  @brief Get on-process diagonal block of `MATCEED`
+
+  This is a block per-process of the diagonal of the global matrix.
+  This is NOT the diagonal blocks associated with the block size of the matrix (i.e. `MatSetBlockSize()` has no effect on this function).
 
   Collective across MPI processes.
 
-  @param[in]   mat_ceed     `MATCEED` to invert
-  @param[in]   num_blocks   The number of blocks on the process
-  @param[in]   block_sizes  The size of each block on the process
-  @param[out]  values       The block inverses in column major order
+  @param[in]   mat_ceed   `MATCEED` to invert
+  @param[out]  mat_block  The diagonal block matrix
 
   @return An error code: 0 - success, otherwise - failure
 **/
-static PetscErrorCode MatInvertVariableBlockDiagonal_Ceed(Mat mat_ceed, PetscInt num_blocks, const PetscInt *block_sizes, PetscScalar *values) {
-  Mat            mat_inner = NULL;
+static PetscErrorCode MatGetDiagonalBlock_Ceed(Mat mat_ceed, Mat *mat_block) {
   MatCeedContext ctx;
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat_ceed, &ctx));
 
-  // Assemble inner mat if needed
-  PetscCall(MatCeedAssembleInnerBlockDiagonalMat(mat_ceed, ctx->is_ceed_vpbd_valid, &mat_inner));
+  // Check if COO pattern set
+  if (!ctx->mat_assembled_full_internal) PetscCall(MatCeedCreateMatCOO(mat_ceed, &ctx->mat_assembled_full_internal));
+
+  // Assemble mat_assembled_full_internal
+  PetscCall(MatCeedAssembleCOO(mat_ceed, ctx->mat_assembled_full_internal));
+
+  // Get diagonal block
+  PetscCall(MatGetDiagonalBlock(ctx->mat_assembled_full_internal, mat_block));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief View `MATCEED`.
+
+  Collective across MPI processes.
+
+  @param[in]   mat_ceed  `MATCEED` to view
+  @param[in]   viewer    The visualization context
 
-  // Invert PB diagonal
-  PetscCall(MatInvertVariableBlockDiagonal(mat_inner, num_blocks, block_sizes, values));
+  @return An error code: 0 - success, otherwise - failure
+**/
+static PetscErrorCode MatView_Ceed(Mat mat_ceed, PetscViewer viewer) {
+  PetscBool         is_ascii;
+  PetscViewerFormat format;
+  PetscMPIInt       size, rank;
+  MatCeedContext    ctx;
+
+  PetscFunctionBeginUser;
+  PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
+  PetscCall(MatShellGetContext(mat_ceed, &ctx));
+  if (!viewer) PetscCall(PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)mat_ceed), &viewer));
+
+  PetscCall(PetscViewerGetFormat(viewer, &format));
+  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat_ceed), &size));
+  if (size == 1 && format == PETSC_VIEWER_LOAD_BALANCE) PetscFunctionReturn(PETSC_SUCCESS);
+
+  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat_ceed), &rank));
+  if (rank != 0) PetscFunctionReturn(PETSC_SUCCESS);
+
+  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &is_ascii));
+  {
+    PetscBool is_detailed     = format == PETSC_VIEWER_ASCII_INFO_DETAIL;
+    char      rank_string[16] = {'\0'};
+    FILE     *file;
+
+    PetscCall(PetscViewerASCIIPrintf(viewer, "MatCEED:\n"));
+    PetscCall(PetscViewerASCIIPushTab(viewer));  // MatCEED
+    PetscCall(PetscViewerASCIIPrintf(viewer, "Default COO MatType: %s\n", ctx->coo_mat_type));
+    PetscCall(PetscSNPrintf(rank_string, 16, "on Rank %d", rank));
+    PetscCall(PetscViewerASCIIPrintf(viewer, "CeedOperator Apply %s:\n", is_detailed ? rank_string : "Summary"));
+    PetscCall(PetscViewerASCIIPrintf(viewer, "libCEED PB Diagonal Assembly: %s\n", ctx->is_ceed_pbd_valid ? "True" : "False"));
+    PetscCall(PetscViewerASCIIPrintf(viewer, "libCEED VPB Diagonal Assembly: %s\n", ctx->is_ceed_vpbd_valid ? "True" : "False"));
+    PetscCall(PetscViewerASCIIGetPointer(viewer, &file));
+    PetscCall(PetscViewerASCIIPushTab(viewer));  // CeedOperator
+    if (is_detailed) PetscCallCeed(ctx->ceed, CeedOperatorView(ctx->op_mult, file));
+    else PetscCallCeed(ctx->ceed, CeedOperatorViewTerse(ctx->op_mult, file));
+    PetscCall(PetscViewerASCIIPopTab(viewer));  // CeedOperator
+    if (ctx->op_mult_transpose) {
+      PetscCall(PetscViewerASCIIPrintf(viewer, "CeedOperator ApplyTranspose %s:\n", is_detailed ? rank_string : "Summary"));
+      PetscCall(PetscViewerASCIIPushTab(viewer));  // CeedOperator
+      if (is_detailed) PetscCallCeed(ctx->ceed, CeedOperatorView(ctx->op_mult_transpose, file));
+      else PetscCallCeed(ctx->ceed, CeedOperatorViewTerse(ctx->op_mult_transpose, file));
+      PetscCall(PetscViewerASCIIPopTab(viewer));  // CeedOperator
+    }
+    PetscCall(PetscViewerASCIIPopTab(viewer));  // MatCEED
+  }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -287,7 +314,7 @@ static PetscErrorCode MatInvertVariableBlockDiagonal_Ceed(Mat mat_ceed, PetscInt
 
   @return An error code: 0 - success, otherwise - failure
 **/
-PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat) {
+PetscErrorCode MatCreateCeed(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperator op_mult_transpose, Mat *mat) {
   PetscInt       X_l_size, X_g_size, Y_l_size, Y_g_size;
   VecType        vec_type;
   MatCeedContext ctx;
@@ -317,6 +344,7 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato
     Y_g_size = X_g_size;
     Y_l_size = X_l_size;
   }
+
   // Create context
   {
     Vec X_loc, Y_loc_transpose = NULL;
@@ -327,7 +355,8 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato
       PetscCall(DMCreateLocalVector(dm_y, &Y_loc_transpose));
       PetscCall(VecZeroEntries(Y_loc_transpose));
     }
-    PetscCall(MatCeedContextCreate(dm_x, dm_y, X_loc, Y_loc_transpose, op_mult, op_mult_transpose, MATCEED_MULT, MATCEED_MULT_TRANSPOSE, &ctx));
+    PetscCall(MatCeedContextCreate(dm_x, dm_y, X_loc, Y_loc_transpose, op_mult, op_mult_transpose, MATCEED_MULT, MATCEED_MULT_TRANSPOSE,
+                                   MATCEED_MULT_CEEDOP, MATCEED_MULT_TRANSPOSE_CEEDOP, &ctx));
     PetscCall(VecDestroy(&X_loc));
     PetscCall(VecDestroy(&Y_loc_transpose));
   }
@@ -377,8 +406,8 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato
           CeedInt       num_sub_operators;
           CeedOperator *sub_operators;
 
-          PetscCallCeed(ctx->ceed, CeedCompositeOperatorGetNumSub(op_mult, &num_sub_operators));
-          PetscCallCeed(ctx->ceed, CeedCompositeOperatorGetSubList(op_mult, &sub_operators));
+          PetscCallCeed(ctx->ceed, CeedOperatorCompositeGetNumSub(op_mult, &num_sub_operators));
+          PetscCallCeed(ctx->ceed, CeedOperatorCompositeGetSubList(op_mult, &sub_operators));
           for (CeedInt i = 0; i < num_sub_operators; i++) {
             CeedInt                  num_bases, num_comp;
             CeedBasis               *active_bases;
@@ -428,22 +457,23 @@ PetscErrorCode MatCeedCreate(DM dm_x, DM dm_y, CeedOperator op_mult, CeedOperato
   // -- Set internal mat type
   {
     VecType vec_type;
-    MatType internal_mat_type = MATAIJ;
+    MatType coo_mat_type;
 
     PetscCall(VecGetType(ctx->X_loc, &vec_type));
-    if (strstr(vec_type, VECCUDA)) internal_mat_type = MATAIJCUSPARSE;
-    else if (strstr(vec_type, VECKOKKOS)) internal_mat_type = MATAIJKOKKOS;
-    else internal_mat_type = MATAIJ;
-    PetscCall(PetscStrallocpy(internal_mat_type, &ctx->internal_mat_type));
+    if (strstr(vec_type, VECCUDA)) coo_mat_type = MATAIJCUSPARSE;
+    else if (strstr(vec_type, VECKOKKOS)) coo_mat_type = MATAIJKOKKOS;
+    else coo_mat_type = MATAIJ;
+    PetscCall(PetscStrallocpy(coo_mat_type, &ctx->coo_mat_type));
   }
   // -- Set mat operations
-  PetscCall(MatShellSetContextDestroy(*mat, (PetscErrorCode(*)(void *))MatCeedContextDestroy));
+  PetscCall(MatShellSetContextDestroy(*mat, (PetscCtxDestroyFn *)MatCeedContextDestroy));
+  PetscCall(MatShellSetOperation(*mat, MATOP_VIEW, (void (*)(void))MatView_Ceed));
   PetscCall(MatShellSetOperation(*mat, MATOP_MULT, (void (*)(void))MatMult_Ceed));
   if (op_mult_transpose) PetscCall(MatShellSetOperation(*mat, MATOP_MULT_TRANSPOSE, (void (*)(void))MatMultTranspose_Ceed));
   PetscCall(MatShellSetOperation(*mat, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiagonal_Ceed));
   PetscCall(MatShellSetOperation(*mat, MATOP_GET_DIAGONAL_BLOCK, (void (*)(void))MatGetDiagonalBlock_Ceed));
-  PetscCall(MatShellSetOperation(*mat, MATOP_INVERT_BLOCK_DIAGONAL, (void (*)(void))MatInvertBlockDiagonal_Ceed));
-  PetscCall(MatShellSetOperation(*mat, MATOP_INVERT_VBLOCK_DIAGONAL, (void (*)(void))MatInvertVariableBlockDiagonal_Ceed));
+  PetscCall(MatShellSetOperation(*mat, MATOP_GET_BLOCK_DIAGONAL, (void (*)(void))MatGetBlockDiagonal_Ceed));
+  PetscCall(MatShellSetOperation(*mat, MATOP_GET_VBLOCK_DIAGONAL, (void (*)(void))MatGetVariableBlockDiagonal_Ceed));
   PetscCall(MatShellSetVecType(*mat, vec_type));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -464,13 +494,16 @@ PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other) {
 
   // Check type compatibility
   {
-    MatType mat_type_ceed, mat_type_other;
+    PetscBool is_matceed = PETSC_FALSE, is_matshell = PETSC_FALSE;
+    MatType   mat_type_ceed, mat_type_other;
 
     PetscCall(MatGetType(mat_ceed, &mat_type_ceed));
-    PetscCheck(!strcmp(mat_type_ceed, MATCEED), PETSC_COMM_SELF, PETSC_ERR_LIB, "mat_ceed must have type " MATCEED);
-    PetscCall(MatGetType(mat_ceed, &mat_type_other));
-    PetscCheck(!strcmp(mat_type_other, MATCEED) || !strcmp(mat_type_other, MATSHELL), PETSC_COMM_SELF, PETSC_ERR_LIB,
-               "mat_other must have type " MATCEED " or " MATSHELL);
+    PetscCall(PetscStrcmp(mat_type_ceed, MATCEED, &is_matceed));
+    PetscCheck(is_matceed, PETSC_COMM_SELF, PETSC_ERR_LIB, "mat_ceed must have type " MATCEED);
+    PetscCall(MatGetType(mat_other, &mat_type_other));
+    PetscCall(PetscStrcmp(mat_type_other, MATCEED, &is_matceed));
+    PetscCall(PetscStrcmp(mat_type_other, MATSHELL, &is_matceed));
+    PetscCheck(is_matceed || is_matshell, PETSC_COMM_SELF, PETSC_ERR_LIB, "mat_other must have type " MATCEED " or " MATSHELL);
   }
 
   // Check dimension compatibility
@@ -499,13 +532,14 @@ PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other) {
     PetscCall(MatShellGetContext(mat_ceed, &ctx));
     PetscCall(MatCeedContextReference(ctx));
     PetscCall(MatShellSetContext(mat_other, ctx));
-    PetscCall(MatShellSetContextDestroy(mat_other, (PetscErrorCode(*)(void *))MatCeedContextDestroy));
+    PetscCall(MatShellSetContextDestroy(mat_other, (PetscCtxDestroyFn *)MatCeedContextDestroy));
+    PetscCall(MatShellSetOperation(mat_other, MATOP_VIEW, (void (*)(void))MatView_Ceed));
     PetscCall(MatShellSetOperation(mat_other, MATOP_MULT, (void (*)(void))MatMult_Ceed));
     if (ctx->op_mult_transpose) PetscCall(MatShellSetOperation(mat_other, MATOP_MULT_TRANSPOSE, (void (*)(void))MatMultTranspose_Ceed));
     PetscCall(MatShellSetOperation(mat_other, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiagonal_Ceed));
     PetscCall(MatShellSetOperation(mat_other, MATOP_GET_DIAGONAL_BLOCK, (void (*)(void))MatGetDiagonalBlock_Ceed));
-    PetscCall(MatShellSetOperation(mat_other, MATOP_INVERT_BLOCK_DIAGONAL, (void (*)(void))MatInvertBlockDiagonal_Ceed));
-    PetscCall(MatShellSetOperation(mat_other, MATOP_INVERT_VBLOCK_DIAGONAL, (void (*)(void))MatInvertVariableBlockDiagonal_Ceed));
+    PetscCall(MatShellSetOperation(mat_other, MATOP_GET_BLOCK_DIAGONAL, (void (*)(void))MatGetBlockDiagonal_Ceed));
+    PetscCall(MatShellSetOperation(mat_other, MATOP_GET_VBLOCK_DIAGONAL, (void (*)(void))MatGetVariableBlockDiagonal_Ceed));
     {
       PetscInt block_size;
 
@@ -525,6 +559,132 @@ PetscErrorCode MatCeedCopy(Mat mat_ceed, Mat mat_other) {
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+/**
+  @brief Mark `CeedQFunction` data as updated and the `CeedQFunction` as requiring re-assembly for a `MatCEED`.
+
+  Collective across MPI processes.
+
+  @param[in]   mat_ceed       `MATCEED`
+  @param[out]  update_needed  Boolean flag indicating `CeedQFunction` update needed
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetAssemblyDataUpdateNeeded(Mat mat_ceed, PetscBool update_needed) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat_ceed, &ctx));
+  PetscCallCeed(ctx->ceed, CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(ctx->op_mult, update_needed));
+  if (ctx->op_mult_transpose) {
+    PetscCallCeed(ctx->ceed, CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(ctx->op_mult_transpose, update_needed));
+  }
+  if (update_needed) {
+    PetscCall(MatAssemblyBegin(mat_ceed, MAT_FINAL_ASSEMBLY));
+    PetscCall(MatAssemblyEnd(mat_ceed, MAT_FINAL_ASSEMBLY));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Setup a `Mat` with the same COO pattern as a `MatCEED`.
+
+  Collective across MPI processes.
+
+  @param[in]   mat_ceed  `MATCEED`
+  @param[out]  mat_coo   Sparse `Mat` with same COO pattern
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedCreateMatCOO(Mat mat_ceed, Mat *mat_coo) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat_ceed, &ctx));
+
+  PetscCheck(ctx->dm_x == ctx->dm_y, PetscObjectComm((PetscObject)mat_ceed), PETSC_ERR_SUP, "COO assembly only supported for MATCEED on a single DM");
+
+  // Check cl mat type
+  {
+    PetscBool is_coo_mat_type_cl = PETSC_FALSE;
+    char      coo_mat_type_cl[64];
+
+    // Check for specific CL coo mat type for this Mat
+    {
+      const char *mat_ceed_prefix = NULL;
+
+      PetscCall(MatGetOptionsPrefix(mat_ceed, &mat_ceed_prefix));
+      PetscOptionsBegin(PetscObjectComm((PetscObject)mat_ceed), mat_ceed_prefix, "", NULL);
+      PetscCall(PetscOptionsFList("-ceed_coo_mat_type", "Default MATCEED COO assembly MatType", NULL, MatList, coo_mat_type_cl, coo_mat_type_cl,
+                                  sizeof(coo_mat_type_cl), &is_coo_mat_type_cl));
+      PetscOptionsEnd();
+      if (is_coo_mat_type_cl) {
+        PetscCall(PetscFree(ctx->coo_mat_type));
+        PetscCall(PetscStrallocpy(coo_mat_type_cl, &ctx->coo_mat_type));
+      }
+    }
+  }
+
+  // Create sparse matrix
+  {
+    MatType dm_mat_type, dm_mat_type_copy;
+
+    PetscCall(DMGetMatType(ctx->dm_x, &dm_mat_type));
+    PetscCall(PetscStrallocpy(dm_mat_type, (char **)&dm_mat_type_copy));
+    PetscCall(DMSetMatType(ctx->dm_x, ctx->coo_mat_type));
+    PetscCall(DMCreateMatrix(ctx->dm_x, mat_coo));
+    PetscCall(DMSetMatType(ctx->dm_x, dm_mat_type_copy));
+    PetscCall(PetscFree(dm_mat_type_copy));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Setup the COO preallocation `MATCEED` into a `MATAIJ` or similar.
+         The caller is responsible for assuring the global and local sizes are compatible, otherwise this function will fail.
+
+  Collective across MPI processes.
+
+  @param[in]      mat_ceed  `MATCEED` to assemble
+  @param[in,out]  mat_coo   `MATAIJ` or similar to assemble into
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetPreallocationCOO(Mat mat_ceed, Mat mat_coo) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat_ceed, &ctx));
+
+  {
+    PetscInt     *rows_petsc = NULL, *cols_petsc = NULL;
+    CeedInt      *rows_ceed, *cols_ceed;
+    PetscCount    num_entries;
+    PetscLogStage stage_amg_setup;
+
+    // -- Assemble sparsity pattern if mat hasn't been assembled before
+    PetscCall(PetscLogStageGetId("MatCEED Asm Setup", &stage_amg_setup));
+    if (stage_amg_setup == -1) {
+      PetscCall(PetscLogStageRegister("MatCEED Asm Setup", &stage_amg_setup));
+    }
+    PetscCall(PetscLogStagePush(stage_amg_setup));
+    PetscCall(PetscLogEventBegin(MATCEED_SETUP_FULL, mat_ceed, mat_coo, NULL, NULL));
+    PetscCall(PetscLogEventBegin(MATCEED_SETUP_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
+    PetscCallCeed(ctx->ceed, CeedOperatorLinearAssembleSymbolic(ctx->op_mult, &num_entries, &rows_ceed, &cols_ceed));
+    PetscCall(PetscLogEventEnd(MATCEED_SETUP_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
+    PetscCall(IntArrayCeedToPetsc(num_entries, &rows_ceed, &rows_petsc));
+    PetscCall(IntArrayCeedToPetsc(num_entries, &cols_ceed, &cols_petsc));
+    PetscCall(MatSetPreallocationCOOLocal(mat_coo, num_entries, rows_petsc, cols_petsc));
+    free(rows_petsc);
+    free(cols_petsc);
+    if (!ctx->coo_values_full) PetscCallCeed(ctx->ceed, CeedVectorCreate(ctx->ceed, num_entries, &ctx->coo_values_full));
+    PetscCall(PetscRealloc(++ctx->num_mats_assembled_full * sizeof(Mat), &ctx->mats_assembled_full));
+    ctx->mats_assembled_full[ctx->num_mats_assembled_full - 1] = mat_coo;
+    PetscCall(PetscLogEventEnd(MATCEED_SETUP_FULL, mat_ceed, mat_coo, NULL, NULL));
+    PetscCall(PetscLogStagePop());
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 /**
   @brief Assemble a `MATCEED` into a `MATAIJ` or similar.
          The `mat_coo` preallocation is set to match the sparsity pattern of `mat_ceed`.
@@ -543,39 +703,18 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) {
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat_ceed, &ctx));
 
-  // Check if COO pattern set
+  // Set COO pattern if needed
   {
-    PetscInt index = -1;
+    CeedInt index = -1;
 
     for (PetscInt i = 0; i < ctx->num_mats_assembled_full; i++) {
       if (ctx->mats_assembled_full[i] == mat_coo) index = i;
     }
-    if (index == -1) {
-      PetscInt     *rows_petsc = NULL, *cols_petsc = NULL;
-      CeedInt      *rows_ceed, *cols_ceed;
-      PetscCount    num_entries;
-      PetscLogStage stage_amg_setup;
-
-      // -- Assemble sparsity pattern if mat hasn't been assembled before
-      PetscCall(PetscLogStageGetId("MATCEED Assembly Setup", &stage_amg_setup));
-      if (stage_amg_setup == -1) {
-        PetscCall(PetscLogStageRegister("MATCEED Assembly Setup", &stage_amg_setup));
-      }
-      PetscCall(PetscLogStagePush(stage_amg_setup));
-      PetscCallCeed(ctx->ceed, CeedOperatorLinearAssembleSymbolic(ctx->op_mult, &num_entries, &rows_ceed, &cols_ceed));
-      PetscCall(IntArrayCeedToPetsc(num_entries, &rows_ceed, &rows_petsc));
-      PetscCall(IntArrayCeedToPetsc(num_entries, &cols_ceed, &cols_petsc));
-      PetscCall(MatSetPreallocationCOOLocal(mat_coo, num_entries, rows_petsc, cols_petsc));
-      free(rows_petsc);
-      free(cols_petsc);
-      if (!ctx->coo_values_full) PetscCallCeed(ctx->ceed, CeedVectorCreate(ctx->ceed, num_entries, &ctx->coo_values_full));
-      PetscCall(PetscRealloc(++ctx->num_mats_assembled_full * sizeof(Mat), &ctx->mats_assembled_full));
-      ctx->mats_assembled_full[ctx->num_mats_assembled_full - 1] = mat_coo;
-      PetscCall(PetscLogStagePop());
-    }
+    if (index == -1) PetscCall(MatCeedSetPreallocationCOO(mat_ceed, mat_coo));
   }
 
   // Assemble mat_ceed
+  PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_FULL, mat_ceed, mat_coo, NULL, NULL));
   PetscCall(MatAssemblyBegin(mat_coo, MAT_FINAL_ASSEMBLY));
   {
     const CeedScalar *values;
@@ -588,7 +727,9 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) {
     else if (strstr(mat_type, "kokkos")) mem_type = CEED_MEM_DEVICE;
     else mem_type = CEED_MEM_HOST;
 
+    PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
     PetscCallCeed(ctx->ceed, CeedOperatorLinearAssemble(ctx->op_mult, ctx->coo_values_full));
+    PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_FULL_CEEDOP, mat_ceed, mat_coo, NULL, NULL));
     PetscCallCeed(ctx->ceed, CeedVectorGetArrayRead(ctx->coo_values_full, mem_type, &values));
     PetscCall(MatSetValuesCOO(mat_coo, values, INSERT_VALUES));
     PetscCall(MatIsSPDKnown(mat_ceed, &is_spd_known, &is_spd));
@@ -596,6 +737,222 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) {
     PetscCallCeed(ctx->ceed, CeedVectorRestoreArrayRead(ctx->coo_values_full, &values));
   }
   PetscCall(MatAssemblyEnd(mat_coo, MAT_FINAL_ASSEMBLY));
+  PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_FULL, mat_ceed, mat_coo, NULL, NULL));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Set the current value of a context field for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat    `MatCEED`
+  @param[in]      name   Name of the context field
+  @param[in]      value  New context field value
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetContextDouble(Mat mat, const char *name, double value) {
+  PetscBool      was_updated = PETSC_FALSE;
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat, &ctx));
+  {
+    CeedContextFieldLabel label = NULL;
+
+    PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(ctx->op_mult, name, &label));
+    if (label) {
+      double set_value = 2 * value + 1.0;
+
+      PetscCall(MatCeedGetContextDouble(mat, name, &set_value));
+      if (set_value != value) {
+        PetscCallCeed(ctx->ceed, CeedOperatorSetContextDouble(ctx->op_mult, label, &value));
+        was_updated = PETSC_TRUE;
+      }
+    }
+    if (ctx->op_mult_transpose) {
+      label = NULL;
+      PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(ctx->op_mult_transpose, name, &label));
+      if (label) {
+        double set_value = 2 * value + 1.0;
+
+        PetscCall(MatCeedGetContextDouble(mat, name, &set_value));
+        if (set_value != value) {
+          PetscCallCeed(ctx->ceed, CeedOperatorSetContextDouble(ctx->op_mult_transpose, label, &value));
+          was_updated = PETSC_TRUE;
+        }
+      }
+    }
+  }
+  if (was_updated) {
+    PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
+    PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Get the current value of a context field for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in]   mat    `MatCEED`
+  @param[in]   name   Name of the context field
+  @param[out]  value  Current context field value
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedGetContextDouble(Mat mat, const char *name, double *value) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat, &ctx));
+  {
+    CeedContextFieldLabel label = NULL;
+    CeedOperator          op    = ctx->op_mult;
+
+    PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(op, name, &label));
+    if (!label && ctx->op_mult_transpose) {
+      op = ctx->op_mult_transpose;
+      PetscCallCeed(ctx->ceed, CeedOperatorGetContextFieldLabel(op, name, &label));
+    }
+    if (label) {
+      PetscSizeT    num_values;
+      const double *values_ceed;
+
+      PetscCallCeed(ctx->ceed, CeedOperatorGetContextDoubleRead(op, label, &num_values, &values_ceed));
+      *value = values_ceed[0];
+      PetscCallCeed(ctx->ceed, CeedOperatorRestoreContextDoubleRead(op, label, &values_ceed));
+    }
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Set the current `PetscReal` value of a context field for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat    `MatCEED`
+  @param[in]      name   Name of the context field
+  @param[in]      value  New context field value
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetContextReal(Mat mat, const char *name, PetscReal value) {
+  double value_double = value;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatCeedSetContextDouble(mat, name, value_double));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Get the current `PetscReal` value of a context field for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in]   mat    `MatCEED`
+  @param[in]   name   Name of the context field
+  @param[out]  value  Current context field value
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedGetContextReal(Mat mat, const char *name, PetscReal *value) {
+  double value_double = 0.0;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatCeedGetContextDouble(mat, name, &value_double));
+  *value = value_double;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Set the current time for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat   `MatCEED`
+  @param[in]      time  Current time
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetTime(Mat mat, PetscReal time) {
+  PetscFunctionBeginUser;
+  {
+    double time_ceed = time;
+
+    PetscCall(MatCeedSetContextDouble(mat, "time", time_ceed));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Get the current time for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in]   mat   `MatCEED`
+  @param[out]  time  Current time, or -1.0 if the boundary evaluator has no time field
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedGetTime(Mat mat, PetscReal *time) {
+  PetscFunctionBeginUser;
+  *time = -1.0;
+  {
+    double time_ceed = -1.0;
+
+    PetscCall(MatCeedGetContextDouble(mat, "time", &time_ceed));
+    *time = time_ceed;
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Set the current time step for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat  `MatCEED`
+  @param[in]      dt   Current time step
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetDt(Mat mat, PetscReal dt) {
+  PetscFunctionBeginUser;
+  {
+    double dt_ceed = dt;
+
+    PetscCall(MatCeedSetContextDouble(mat, "dt", dt_ceed));
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Set the Jacobian shifts for a `MatCEED`.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat      `MatCEED`
+  @param[in]      shift_v  Velocity shift
+  @param[in]      shift_a  Acceleration shift
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetShifts(Mat mat, PetscReal shift_v, PetscReal shift_a) {
+  PetscFunctionBeginUser;
+  {
+    double shift_v_ceed = shift_v;
+
+    PetscCall(MatCeedSetContextDouble(mat, "shift v", shift_v_ceed));
+  }
+  if (shift_a) {
+    double shift_a_ceed = shift_a;
+
+    PetscCall(MatCeedSetContextDouble(mat, "shift a", shift_a_ceed));
+  }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -610,14 +967,14 @@ PetscErrorCode MatCeedAssembleCOO(Mat mat_ceed, Mat mat_coo) {
 
   @return An error code: 0 - success, otherwise - failure
 **/
-PetscErrorCode MatCeedSetContext(Mat mat, PetscErrorCode (*f)(void *), void *ctx) {
+PetscErrorCode MatCeedSetContext(Mat mat, PetscCtxDestroyFn f, void *ctx) {
   PetscContainer user_ctx = NULL;
 
   PetscFunctionBeginUser;
   if (ctx) {
     PetscCall(PetscContainerCreate(PetscObjectComm((PetscObject)mat), &user_ctx));
     PetscCall(PetscContainerSetPointer(user_ctx, ctx));
-    PetscCall(PetscContainerSetUserDestroy(user_ctx, f));
+    PetscCall(PetscContainerSetCtxDestroy(user_ctx, f));
   }
   PetscCall(PetscObjectCompose((PetscObject)mat, "MatCeed user context", (PetscObject)user_ctx));
   PetscCall(PetscContainerDestroy(&user_ctx));
@@ -643,18 +1000,37 @@ PetscErrorCode MatCeedGetContext(Mat mat, void *ctx) {
   else *(void **)ctx = NULL;
   PetscFunctionReturn(PETSC_SUCCESS);
 }
+/**
+  @brief Set a user defined matrix operation for a `MATCEED` matrix.
+
+  Within each user-defined routine, the user should call `MatCeedGetContext()` to obtain the user-defined context that was set by
+`MatCeedSetContext()`.
+
+  Collective across MPI processes.
+
+  @param[in,out]  mat  `MATCEED`
+  @param[in]      op   Name of the `MatOperation`
+  @param[in]      g    Function that provides the operation
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void)) {
+  PetscFunctionBeginUser;
+  PetscCall(MatShellSetOperation(mat, op, g));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
 
 /**
-  @brief Sets the inner matrix type as a string from the `MATCEED`.
+  @brief Sets the default COO matrix type as a string from the `MATCEED`.
 
   Collective across MPI processes.
 
   @param[in,out]  mat   `MATCEED`
-  @param[in]      type  Inner `MatType` to set
+  @param[in]      type  COO `MatType` to set
 
   @return An error code: 0 - success, otherwise - failure
 **/
-PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type) {
+PetscErrorCode MatCeedSetCOOMatType(Mat mat, MatType type) {
   MatCeedContext ctx;
 
   PetscFunctionBeginUser;
@@ -664,9 +1040,9 @@ PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type) {
     size_t    len_old, len_new;
     PetscBool is_same = PETSC_FALSE;
 
-    PetscCall(PetscStrlen(ctx->internal_mat_type, &len_old));
+    PetscCall(PetscStrlen(ctx->coo_mat_type, &len_old));
     PetscCall(PetscStrlen(type, &len_new));
-    if (len_old == len_new) PetscCall(PetscStrncmp(ctx->internal_mat_type, type, len_old, &is_same));
+    if (len_old == len_new) PetscCall(PetscStrncmp(ctx->coo_mat_type, type, len_old, &is_same));
     if (is_same) PetscFunctionReturn(PETSC_SUCCESS);
   }
   // Clean up old mats in different format
@@ -695,48 +1071,28 @@ PetscErrorCode MatCeedSetInnerMatType(Mat mat, MatType type) {
       }
     }
   }
-  PetscCall(PetscFree(ctx->internal_mat_type));
-  PetscCall(PetscStrallocpy(type, &ctx->internal_mat_type));
+  PetscCall(PetscFree(ctx->coo_mat_type));
+  PetscCall(PetscStrallocpy(type, &ctx->coo_mat_type));
   PetscFunctionReturn(PETSC_SUCCESS);
   // LCOV_EXCL_STOP
 }
 
 /**
-  @brief Gets the inner matrix type as a string from the `MATCEED`.
+  @brief Gets the default COO matrix type as a string from the `MATCEED`.
 
   Collective across MPI processes.
 
   @param[in,out]  mat   `MATCEED`
-  @param[in]      type  Inner `MatType`
+  @param[in]      type  COO `MatType`
 
   @return An error code: 0 - success, otherwise - failure
 **/
-PetscErrorCode MatCeedGetInnerMatType(Mat mat, MatType *type) {
+PetscErrorCode MatCeedGetCOOMatType(Mat mat, MatType *type) {
   MatCeedContext ctx;
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat, &ctx));
-  *type = ctx->internal_mat_type;
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-/**
-  @brief Set a user defined matrix operation for a `MATCEED` matrix.
-
-  Within each user-defined routine, the user should call `MatCeedGetContext()` to obtain the user-defined context that was set by
-`MatCeedSetContext()`.
-
-  Collective across MPI processes.
-
-  @param[in,out]  mat  `MATCEED`
-  @param[in]      op   Name of the `MatOperation`
-  @param[in]      g    Function that provides the operation
-
-  @return An error code: 0 - success, otherwise - failure
-**/
-PetscErrorCode MatCeedSetOperation(Mat mat, MatOperation op, void (*g)(void)) {
-  PetscFunctionBeginUser;
-  PetscCall(MatShellSetOperation(mat, op, g));
+  *type = ctx->coo_mat_type;
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -763,9 +1119,7 @@ PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose) {
     PetscCall(VecGetSize(X_loc, &len_new));
     PetscCheck(len_old == len_new, PETSC_COMM_SELF, PETSC_ERR_LIB, "new X_loc length %" PetscInt_FMT " should match old X_loc length %" PetscInt_FMT,
                len_new, len_old);
-    PetscCall(VecDestroy(&ctx->X_loc));
-    ctx->X_loc = X_loc;
-    PetscCall(PetscObjectReference((PetscObject)X_loc));
+    PetscCall(VecReferenceCopy(X_loc, &ctx->X_loc));
   }
   if (Y_loc_transpose) {
     PetscInt len_old, len_new;
@@ -774,9 +1128,7 @@ PetscErrorCode MatCeedSetLocalVectors(Mat mat, Vec X_loc, Vec Y_loc_transpose) {
     PetscCall(VecGetSize(Y_loc_transpose, &len_new));
     PetscCheck(len_old == len_new, PETSC_COMM_SELF, PETSC_ERR_LIB,
                "new Y_loc_transpose length %" PetscInt_FMT " should match old Y_loc_transpose length %" PetscInt_FMT, len_new, len_old);
-    PetscCall(VecDestroy(&ctx->Y_loc_transpose));
-    ctx->Y_loc_transpose = Y_loc_transpose;
-    PetscCall(PetscObjectReference((PetscObject)Y_loc_transpose));
+    PetscCall(VecReferenceCopy(Y_loc_transpose, &ctx->Y_loc_transpose));
   }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -798,12 +1150,12 @@ PetscErrorCode MatCeedGetLocalVectors(Mat mat, Vec *X_loc, Vec *Y_loc_transpose)
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(mat, &ctx));
   if (X_loc) {
-    *X_loc = ctx->X_loc;
-    PetscCall(PetscObjectReference((PetscObject)*X_loc));
+    *X_loc = NULL;
+    PetscCall(VecReferenceCopy(ctx->X_loc, X_loc));
   }
   if (Y_loc_transpose) {
-    *Y_loc_transpose = ctx->Y_loc_transpose;
-    PetscCall(PetscObjectReference((PetscObject)*Y_loc_transpose));
+    *Y_loc_transpose = NULL;
+    PetscCall(VecReferenceCopy(ctx->Y_loc_transpose, Y_loc_transpose));
   }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -916,6 +1268,48 @@ PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, Petsc
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+/**
+  @brief Set `CeedOperator` `PetscLogEvent` for `MATCEED` `MatMult()` and `MatMultTranspose()` operators.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat                       MatCeed
+  @param[out]     log_event_mult            `PetscLogEvent` for forward `CeedOperator` evaluation, or NULL
+  @param[out]     log_event_mult_transpose  `PetscLogEvent` for transpose `CeedOperator` evaluation, or NULL
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedSetCeedOperatorLogEvents(Mat mat, PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat, &ctx));
+  if (log_event_mult) ctx->log_event_ceed_mult = log_event_mult;
+  if (log_event_mult_transpose) ctx->log_event_ceed_mult_transpose = log_event_mult_transpose;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+  @brief Get `CeedOperator` `PetscLogEvent` for `MATCEED` `MatMult()` and `MatMultTranspose()` operators.
+
+  Not collective across MPI processes.
+
+  @param[in,out]  mat                       MatCeed
+  @param[out]     log_event_mult            `PetscLogEvent` for forward `CeedOperator` evaluation, or NULL
+  @param[out]     log_event_mult_transpose  `PetscLogEvent` for transpose `CeedOperator` evaluation, or NULL
+
+  @return An error code: 0 - success, otherwise - failure
+**/
+PetscErrorCode MatCeedGetCeedOperatorLogEvents(Mat mat, PetscLogEvent *log_event_mult, PetscLogEvent *log_event_mult_transpose) {
+  MatCeedContext ctx;
+
+  PetscFunctionBeginUser;
+  PetscCall(MatShellGetContext(mat, &ctx));
+  if (log_event_mult) *log_event_mult = ctx->log_event_ceed_mult;
+  if (log_event_mult_transpose) *log_event_mult_transpose = ctx->log_event_ceed_mult_transpose;
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 // -----------------------------------------------------------------------------
 // Operator context data
 // -----------------------------------------------------------------------------
@@ -925,20 +1319,23 @@ PetscErrorCode MatCeedGetLogEvents(Mat mat, PetscLogEvent *log_event_mult, Petsc
 
   Collective across MPI processes.
 
-  @param[in]   dm_x                      Input `DM`
-  @param[in]   dm_y                      Output `DM`
-  @param[in]   X_loc                     Input PETSc local vector, or NULL
-  @param[in]   Y_loc_transpose           Input PETSc local vector for transpose operation, or NULL
-  @param[in]   op_mult                   `CeedOperator` for forward evaluation
-  @param[in]   op_mult_transpose         `CeedOperator` for transpose evaluation
-  @param[in]   log_event_mult            `PetscLogEvent` for forward evaluation
-  @param[in]   log_event_mult_transpose  `PetscLogEvent` for transpose evaluation
-  @param[out]  ctx                       Context data for operator evaluation
+  @param[in]   dm_x                           Input `DM`
+  @param[in]   dm_y                           Output `DM`
+  @param[in]   X_loc                          Input PETSc local vector, or NULL
+  @param[in]   Y_loc_transpose                Input PETSc local vector for transpose operation, or NULL
+  @param[in]   op_mult                        `CeedOperator` for forward evaluation
+  @param[in]   op_mult_transpose              `CeedOperator` for transpose evaluation
+  @param[in]   log_event_mult                 `PetscLogEvent` for forward evaluation
+  @param[in]   log_event_mult_transpose       `PetscLogEvent` for transpose evaluation
+  @param[in]   log_event_ceed_mult            `PetscLogEvent` for forward `CeedOperator` evaluation
+  @param[in]   log_event_ceed_mult_transpose  `PetscLogEvent` for transpose `CeedOperator` evaluation
+  @param[out]  ctx                            Context data for operator evaluation
 
   @return An error code: 0 - success, otherwise - failure
 **/
 PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_transpose, CeedOperator op_mult, CeedOperator op_mult_transpose,
-                                    PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose, MatCeedContext *ctx) {
+                                    PetscLogEvent log_event_mult, PetscLogEvent log_event_mult_transpose, PetscLogEvent log_event_ceed_mult,
+                                    PetscLogEvent log_event_ceed_mult_transpose, MatCeedContext *ctx) {
   CeedSize x_loc_len, y_loc_len;
 
   PetscFunctionBeginUser;
@@ -948,18 +1345,16 @@ PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_trans
   (*ctx)->ref_count = 1;
 
   // Logging
-  (*ctx)->log_event_mult           = log_event_mult;
-  (*ctx)->log_event_mult_transpose = log_event_mult_transpose;
+  (*ctx)->log_event_mult                = log_event_mult;
+  (*ctx)->log_event_mult_transpose      = log_event_mult_transpose;
+  (*ctx)->log_event_ceed_mult           = log_event_ceed_mult;
+  (*ctx)->log_event_ceed_mult_transpose = log_event_ceed_mult_transpose;
 
   // PETSc objects
-  PetscCall(PetscObjectReference((PetscObject)dm_x));
-  (*ctx)->dm_x = dm_x;
-  PetscCall(PetscObjectReference((PetscObject)dm_y));
-  (*ctx)->dm_y = dm_y;
-  if (X_loc) PetscCall(PetscObjectReference((PetscObject)X_loc));
-  (*ctx)->X_loc = X_loc;
-  if (Y_loc_transpose) PetscCall(PetscObjectReference((PetscObject)Y_loc_transpose));
-  (*ctx)->Y_loc_transpose = Y_loc_transpose;
+  PetscCall(DMReferenceCopy(dm_x, &(*ctx)->dm_x));
+  PetscCall(DMReferenceCopy(dm_y, &(*ctx)->dm_y));
+  if (X_loc) PetscCall(VecReferenceCopy(X_loc, &(*ctx)->X_loc));
+  if (Y_loc_transpose) PetscCall(VecReferenceCopy(Y_loc_transpose, &(*ctx)->Y_loc_transpose));
 
   // Memtype
   {
@@ -975,7 +1370,6 @@ PetscErrorCode MatCeedContextCreate(DM dm_x, DM dm_y, Vec X_loc, Vec Y_loc_trans
   // libCEED objects
   PetscCheck(CeedOperatorGetCeed(op_mult, &(*ctx)->ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_LIB,
              "retrieving Ceed context object failed");
-  PetscCallCeed((*ctx)->ceed, CeedReference((*ctx)->ceed));
   PetscCallCeed((*ctx)->ceed, CeedOperatorGetActiveVectorLengths(op_mult, &x_loc_len, &y_loc_len));
   PetscCallCeed((*ctx)->ceed, CeedOperatorReferenceCopy(op_mult, &(*ctx)->op_mult));
   if (op_mult_transpose) PetscCallCeed((*ctx)->ceed, CeedOperatorReferenceCopy(op_mult_transpose, &(*ctx)->op_mult_transpose));
@@ -1062,7 +1456,7 @@ PetscErrorCode MatCeedContextReference(MatCeedContext ctx) {
 PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *ctx_copy) {
   PetscFunctionBeginUser;
   PetscCall(MatCeedContextReference(ctx));
-  PetscCall(MatCeedContextDestroy(*ctx_copy));
+  PetscCall(MatCeedContextDestroy(ctx_copy));
   *ctx_copy = ctx;
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -1076,33 +1470,33 @@ PetscErrorCode MatCeedContextReferenceCopy(MatCeedContext ctx, MatCeedContext *c
 
   @return An error code: 0 - success, otherwise - failure
 **/
-PetscErrorCode MatCeedContextDestroy(MatCeedContext ctx) {
+PetscErrorCode MatCeedContextDestroy(MatCeedContext *ctx) {
   PetscFunctionBeginUser;
-  if (!ctx || --ctx->ref_count > 0) PetscFunctionReturn(PETSC_SUCCESS);
+  if (!ctx || --(*ctx)->ref_count > 0) PetscFunctionReturn(PETSC_SUCCESS);
 
   // PETSc objects
-  PetscCall(DMDestroy(&ctx->dm_x));
-  PetscCall(DMDestroy(&ctx->dm_y));
-  PetscCall(VecDestroy(&ctx->X_loc));
-  PetscCall(VecDestroy(&ctx->Y_loc_transpose));
-  PetscCall(MatDestroy(&ctx->mat_assembled_full_internal));
-  PetscCall(MatDestroy(&ctx->mat_assembled_pbd_internal));
-  PetscCall(PetscFree(ctx->internal_mat_type));
-  PetscCall(PetscFree(ctx->mats_assembled_full));
-  PetscCall(PetscFree(ctx->mats_assembled_pbd));
+  PetscCall(DMDestroy(&(*ctx)->dm_x));
+  PetscCall(DMDestroy(&(*ctx)->dm_y));
+  PetscCall(VecDestroy(&(*ctx)->X_loc));
+  PetscCall(VecDestroy(&(*ctx)->Y_loc_transpose));
+  PetscCall(MatDestroy(&(*ctx)->mat_assembled_full_internal));
+  PetscCall(MatDestroy(&(*ctx)->mat_assembled_pbd_internal));
+  PetscCall(PetscFree((*ctx)->coo_mat_type));
+  PetscCall(PetscFree((*ctx)->mats_assembled_full));
+  PetscCall(PetscFree((*ctx)->mats_assembled_pbd));
 
   // libCEED objects
-  PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->x_loc));
-  PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->y_loc));
-  PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->coo_values_full));
-  PetscCallCeed(ctx->ceed, CeedVectorDestroy(&ctx->coo_values_pbd));
-  PetscCallCeed(ctx->ceed, CeedOperatorDestroy(&ctx->op_mult));
-  PetscCallCeed(ctx->ceed, CeedOperatorDestroy(&ctx->op_mult_transpose));
-  PetscCheck(CeedDestroy(&ctx->ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_LIB, "destroying libCEED context object failed");
+  PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->x_loc));
+  PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->y_loc));
+  PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->coo_values_full));
+  PetscCallCeed((*ctx)->ceed, CeedVectorDestroy(&(*ctx)->coo_values_pbd));
+  PetscCallCeed((*ctx)->ceed, CeedOperatorDestroy(&(*ctx)->op_mult));
+  PetscCallCeed((*ctx)->ceed, CeedOperatorDestroy(&(*ctx)->op_mult_transpose));
+  PetscCheck(CeedDestroy(&(*ctx)->ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_LIB, "destroying libCEED context object failed");
 
   // Deallocate
-  ctx->is_destroyed = PETSC_TRUE;  // Flag as destroyed in case someone has stale ref
-  PetscCall(PetscFree(ctx));
+  (*ctx)->is_destroyed = PETSC_TRUE;  // Flag as destroyed in case someone has stale ref
+  PetscCall(PetscFree(*ctx));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -1125,11 +1519,14 @@ PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D) {
   PetscCall(MatShellGetContext(A, &ctx));
 
   // Place PETSc vector in libCEED vector
+  PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_DIAGONAL, A, D, NULL, NULL));
   PetscCall(DMGetLocalVector(ctx->dm_x, &D_loc));
   PetscCall(VecPetscToCeed(D_loc, &mem_type, ctx->x_loc));
 
   // Compute Diagonal
+  PetscCall(PetscLogEventBegin(MATCEED_ASSEMBLE_DIAGONAL_CEEDOP, A, D, NULL, NULL));
   PetscCallCeed(ctx->ceed, CeedOperatorLinearAssembleDiagonal(ctx->op_mult, ctx->x_loc, CEED_REQUEST_IMMEDIATE));
+  PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_DIAGONAL_CEEDOP, A, D, NULL, NULL));
 
   // Restore PETSc vector
   PetscCall(VecCeedToPetsc(ctx->x_loc, mem_type, D_loc));
@@ -1138,6 +1535,7 @@ PetscErrorCode MatGetDiagonal_Ceed(Mat A, Vec D) {
   PetscCall(VecZeroEntries(D));
   PetscCall(DMLocalToGlobal(ctx->dm_x, D_loc, ADD_VALUES, D));
   PetscCall(DMRestoreLocalVector(ctx->dm_x, &D_loc));
+  PetscCall(PetscLogEventEnd(MATCEED_ASSEMBLE_DIAGONAL, A, D, NULL, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -1157,7 +1555,7 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) {
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(A, &ctx));
-  PetscCall(PetscLogEventBegin(ctx->log_event_mult, A, X, Y, 0));
+  PetscCall(PetscLogEventBegin(ctx->log_event_mult, A, X, Y, NULL));
 
   {
     PetscMemType x_mem_type, y_mem_type;
@@ -1176,9 +1574,11 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) {
     PetscCall(VecPetscToCeed(Y_loc, &y_mem_type, ctx->y_loc));
 
     // Apply libCEED operator
+    PetscCall(PetscLogEventBegin(ctx->log_event_ceed_mult, A, X, Y, NULL));
     PetscCall(PetscLogGpuTimeBegin());
     PetscCallCeed(ctx->ceed, CeedOperatorApplyAdd(ctx->op_mult, ctx->x_loc, ctx->y_loc, CEED_REQUEST_IMMEDIATE));
     PetscCall(PetscLogGpuTimeEnd());
+    PetscCall(PetscLogEventEnd(ctx->log_event_ceed_mult, A, X, Y, NULL));
 
     // Restore PETSc vectors
     PetscCall(VecReadCeedToPetsc(ctx->x_loc, x_mem_type, X_loc));
@@ -1196,8 +1596,7 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) {
   // Log flops
   if (PetscMemTypeDevice(ctx->mem_type)) PetscCall(PetscLogGpuFlops(ctx->flops_mult));
   else PetscCall(PetscLogFlops(ctx->flops_mult));
-
-  PetscCall(PetscLogEventEnd(ctx->log_event_mult, A, X, Y, 0));
+  PetscCall(PetscLogEventEnd(ctx->log_event_mult, A, X, Y, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -1217,7 +1616,7 @@ PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X) {
 
   PetscFunctionBeginUser;
   PetscCall(MatShellGetContext(A, &ctx));
-  PetscCall(PetscLogEventBegin(ctx->log_event_mult_transpose, A, Y, X, 0));
+  PetscCall(PetscLogEventBegin(ctx->log_event_mult_transpose, A, Y, X, NULL));
 
   {
     PetscMemType x_mem_type, y_mem_type;
@@ -1236,9 +1635,11 @@ PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X) {
     PetscCall(VecPetscToCeed(X_loc, &x_mem_type, ctx->x_loc));
 
     // Apply libCEED operator
+    PetscCall(PetscLogEventBegin(ctx->log_event_ceed_mult_transpose, A, Y, X, NULL));
     PetscCall(PetscLogGpuTimeBegin());
     PetscCallCeed(ctx->ceed, CeedOperatorApplyAdd(ctx->op_mult_transpose, ctx->y_loc, ctx->x_loc, CEED_REQUEST_IMMEDIATE));
     PetscCall(PetscLogGpuTimeEnd());
+    PetscCall(PetscLogEventEnd(ctx->log_event_ceed_mult_transpose, A, Y, X, NULL));
 
     // Restore PETSc vectors
     PetscCall(VecReadCeedToPetsc(ctx->y_loc, y_mem_type, Y_loc));
@@ -1256,7 +1657,6 @@ PetscErrorCode MatMultTranspose_Ceed(Mat A, Vec Y, Vec X) {
   // Log flops
   if (PetscMemTypeDevice(ctx->mem_type)) PetscCall(PetscLogGpuFlops(ctx->flops_mult_transpose));
   else PetscCall(PetscLogFlops(ctx->flops_mult_transpose));
-
-  PetscCall(PetscLogEventEnd(ctx->log_event_mult_transpose, A, Y, X, 0));
+  PetscCall(PetscLogEventEnd(ctx->log_event_mult_transpose, A, Y, X, NULL));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c
index e73769741c..cebed9689b 100644
--- a/examples/fluids/src/misc.c
+++ b/examples/fluids/src/misc.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -95,6 +95,7 @@ PetscErrorCode DMPlexInsertBoundaryValues_FromICs(DM dm, PetscBool insert_essent
 
 static PetscErrorCode BinaryReadIntoInt(PetscViewer viewer, PetscInt *out, PetscDataType file_type) {
   PetscFunctionBeginUser;
+  *out = -13;  // appease the overzealous GCC compiler warning Gods
   if (file_type == PETSC_INT32) {
     PetscInt32 val;
     PetscCall(PetscViewerBinaryRead(viewer, &val, 1, NULL, PETSC_INT32));
@@ -140,31 +141,34 @@ PetscErrorCode LoadFluidsBinaryVec(MPI_Comm comm, PetscViewer viewer, Vec Q, Pet
 
 // Compare reference solution values with current test run for CI
 PetscErrorCode RegressionTest(AppCtx app_ctx, Vec Q) {
-  Vec         Qref;
+  Vec         Q_ref;
   PetscViewer viewer;
-  PetscReal   error, Qrefnorm;
+  PetscReal   error, norm_Q, norm_Q_ref;
   MPI_Comm    comm = PetscObjectComm((PetscObject)Q);
 
   PetscFunctionBeginUser;
   // Read reference file
-  PetscCall(VecDuplicate(Q, &Qref));
+  PetscCall(VecDuplicate(Q, &Q_ref));
+  PetscCheck(strcmp(app_ctx->test_file_path, "") != 0, comm, PETSC_ERR_FILE_READ, "File for regression test not given");
   PetscCall(PetscViewerBinaryOpen(comm, app_ctx->test_file_path, FILE_MODE_READ, &viewer));
-  PetscCall(LoadFluidsBinaryVec(comm, viewer, Qref, NULL, NULL));
+  PetscCall(LoadFluidsBinaryVec(comm, viewer, Q_ref, NULL, NULL));
 
   // Compute error with respect to reference solution
-  PetscCall(VecAXPY(Q, -1.0, Qref));
-  PetscCall(VecNorm(Qref, NORM_MAX, &Qrefnorm));
-  PetscCall(VecScale(Q, 1. / Qrefnorm));
+  PetscCall(VecNorm(Q_ref, NORM_MAX, &norm_Q));
+  PetscCall(VecNorm(Q_ref, NORM_MAX, &norm_Q_ref));
+  PetscCall(VecAXPY(Q, -1.0, Q_ref));
+  PetscCall(VecScale(Q, 1. / norm_Q_ref));
   PetscCall(VecNorm(Q, NORM_MAX, &error));
 
   // Check error
   if (error > app_ctx->test_tol) {
-    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Test failed with error norm %g\n", (double)error));
+    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Test failed with error norm %g\nReference solution max norm: %g Computed solution max norm %g\n",
+                          (double)error, (double)norm_Q_ref, (double)norm_Q));
   }
 
   // Cleanup
   PetscCall(PetscViewerDestroy(&viewer));
-  PetscCall(VecDestroy(&Qref));
+  PetscCall(VecDestroy(&Q_ref));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -200,7 +204,7 @@ PetscErrorCode PostProcess(TS ts, CeedData ceed_data, DM dm, ProblemData problem
 
   PetscFunctionBeginUser;
   // Print relative error
-  if (problem->non_zero_time && user->app_ctx->test_type == TESTTYPE_NONE) {
+  if (problem->compute_exact_solution_error && user->app_ctx->test_type == TESTTYPE_NONE) {
     PetscCall(PrintError(ceed_data, dm, user, Q, final_time));
   }
 
@@ -340,65 +344,35 @@ PetscErrorCode PhastaDatFileGetNRows(const MPI_Comm comm, const char path[PETSC_
 
 PetscErrorCode PhastaDatFileReadToArrayReal(MPI_Comm comm, const char path[PETSC_MAX_PATH_LEN], PetscReal array[]) {
   PetscInt       dims[2];
-  int            ndims;
   FILE          *fp;
   const PetscInt char_array_len = 512;
   char           line[char_array_len];
-  char         **row_array;
 
   PetscFunctionBeginUser;
   PetscCall(PhastaDatFileOpen(comm, path, char_array_len, dims, &fp));
 
   for (PetscInt i = 0; i < dims[0]; i++) {
+    int    ndims;
+    char **row_array;
+
     PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line));
     PetscCall(PetscStrToArray(line, ' ', &ndims, &row_array));
     PetscCheck(ndims == dims[1], comm, PETSC_ERR_FILE_UNEXPECTED,
                "Line %" PetscInt_FMT " of %s does not contain enough columns (%d instead of %" PetscInt_FMT ")", i, path, ndims, dims[1]);
 
-    for (PetscInt j = 0; j < dims[1]; j++) {
-      array[i * dims[1] + j] = (PetscReal)atof(row_array[j]);
-    }
+    for (PetscInt j = 0; j < dims[1]; j++) array[i * dims[1] + j] = (PetscReal)atof(row_array[j]);
+    PetscCall(PetscStrToArrayDestroy(ndims, row_array));
   }
 
   PetscCall(PetscFClose(comm, fp));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
-PetscLogEvent       FLUIDS_CeedOperatorApply;
-PetscLogEvent       FLUIDS_CeedOperatorAssemble;
-PetscLogEvent       FLUIDS_CeedOperatorAssembleDiagonal;
-PetscLogEvent       FLUIDS_CeedOperatorAssemblePointBlockDiagonal;
-PetscLogEvent       FLUIDS_SmartRedis_Init;
-PetscLogEvent       FLUIDS_SmartRedis_Meta;
-PetscLogEvent       FLUIDS_SmartRedis_Train;
-PetscLogEvent       FLUIDS_TrainDataCompute;
-PetscLogEvent       FLUIDS_DifferentialFilter;
-PetscLogEvent       FLUIDS_VelocityGradientProjection;
-static PetscClassId libCEED_classid, onlineTrain_classid, misc_classid;
-
-PetscErrorCode RegisterLogEvents() {
-  PetscFunctionBeginUser;
-  PetscCall(PetscClassIdRegister("libCEED", &libCEED_classid));
-  PetscCall(PetscLogEventRegister("CeedOpApply", libCEED_classid, &FLUIDS_CeedOperatorApply));
-  PetscCall(PetscLogEventRegister("CeedOpAsm", libCEED_classid, &FLUIDS_CeedOperatorAssemble));
-  PetscCall(PetscLogEventRegister("CeedOpAsmD", libCEED_classid, &FLUIDS_CeedOperatorAssembleDiagonal));
-  PetscCall(PetscLogEventRegister("CeedOpAsmPBD", libCEED_classid, &FLUIDS_CeedOperatorAssemblePointBlockDiagonal));
-
-  PetscCall(PetscClassIdRegister("onlineTrain", &onlineTrain_classid));
-  PetscCall(PetscLogEventRegister("SmartRedis_Init", onlineTrain_classid, &FLUIDS_SmartRedis_Init));
-  PetscCall(PetscLogEventRegister("SmartRedis_Meta", onlineTrain_classid, &FLUIDS_SmartRedis_Meta));
-  PetscCall(PetscLogEventRegister("SmartRedis_Train", onlineTrain_classid, &FLUIDS_SmartRedis_Train));
-  PetscCall(PetscLogEventRegister("TrainDataCompute", onlineTrain_classid, &FLUIDS_TrainDataCompute));
-
-  PetscCall(PetscClassIdRegister("Miscellaneous", &misc_classid));
-  PetscCall(PetscLogEventRegister("DiffFilter", misc_classid, &FLUIDS_DifferentialFilter));
-  PetscCall(PetscLogEventRegister("VeloGradProj", misc_classid, &FLUIDS_VelocityGradientProjection));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
 // Print information about the given simulation run
-PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MPI_Comm comm) {
-  Ceed ceed = user->ceed;
+PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, TS ts) {
+  Ceed     ceed = user->ceed;
+  MPI_Comm comm = PetscObjectComm((PetscObject)ts);
+
   PetscFunctionBeginUser;
   // Header and rank
   char        host_name[PETSC_MAX_PATH_LEN];
@@ -427,22 +401,43 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MP
                         "    libCEED Backend MemType            : %s\n",
                         used_resource, CeedMemTypes[mem_type_backend]));
   // PETSc
-  char box_faces_str[PETSC_MAX_PATH_LEN] = "3,3,3";
+  VecType vec_type;
+  char    box_faces_str[PETSC_MAX_PATH_LEN] = "3,3,3";
   if (problem->dim == 2) box_faces_str[3] = '\0';
   PetscCall(PetscOptionsGetString(NULL, NULL, "-dm_plex_box_faces", box_faces_str, sizeof(box_faces_str), NULL));
-  MatType amat_type = user->app_ctx->amat_type, pmat_type;
-  VecType vec_type;
-  PetscCall(DMGetMatType(user->dm, &pmat_type));
-  if (!amat_type) amat_type = pmat_type;
   PetscCall(DMGetVecType(user->dm, &vec_type));
   PetscCall(PetscPrintf(comm,
                         "  PETSc:\n"
                         "    Box Faces                          : %s\n"
-                        "    A MatType                          : %s\n"
-                        "    P MatType                          : %s\n"
                         "    DM VecType                         : %s\n"
                         "    Time Stepping Scheme               : %s\n",
-                        box_faces_str, amat_type, pmat_type, vec_type, phys_ctx->implicit ? "implicit" : "explicit"));
+                        box_faces_str, vec_type, phys_ctx->implicit ? "implicit" : "explicit"));
+  {
+    char           pmat_type_str[PETSC_MAX_PATH_LEN];
+    MatType        amat_type, pmat_type;
+    Mat            Amat, Pmat;
+    TSIJacobianFn *ijacob_function;
+
+    PetscCall(TSGetIJacobian(ts, &Amat, &Pmat, &ijacob_function, NULL));
+    PetscCall(MatGetType(Amat, &amat_type));
+    PetscCall(MatGetType(Pmat, &pmat_type));
+
+    PetscCall(PetscStrncpy(pmat_type_str, pmat_type, sizeof(pmat_type_str)));
+    if (!strcmp(pmat_type, MATCEED)) {
+      MatType pmat_coo_type;
+      char    pmat_coo_type_str[PETSC_MAX_PATH_LEN];
+
+      PetscCall(MatCeedGetCOOMatType(Pmat, &pmat_coo_type));
+      PetscCall(PetscSNPrintf(pmat_coo_type_str, sizeof(pmat_coo_type_str), " (COO MatType: %s)", pmat_coo_type));
+      PetscCall(PetscStrlcat(pmat_type_str, pmat_coo_type_str, sizeof(pmat_type_str)));
+    }
+    if (ijacob_function) {
+      PetscCall(PetscPrintf(comm,
+                            "    IJacobian A MatType                : %s\n"
+                            "    IJacobian P MatType                : %s\n",
+                            amat_type, pmat_type_str));
+    }
+  }
   if (user->app_ctx->cont_steps) {
     PetscCall(PetscPrintf(comm,
                           "  Continue:\n"
@@ -480,9 +475,10 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MP
       part_owned_dofs[1]             = gather_buffer[comm_size - 1];  // max
       part_owned_dofs[2]             = gather_buffer[median_index];   // median
       PetscReal part_owned_dof_ratio = (PetscReal)part_owned_dofs[1] / (PetscReal)part_owned_dofs[2];
-      PetscCall(PetscPrintf(
-          comm, "    Global Vector %" PetscInt_FMT "-DoF nodes          : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n", num_comp_q,
-          part_owned_dofs[0] / num_comp_q, part_owned_dofs[1] / num_comp_q, part_owned_dofs[2] / num_comp_q, part_owned_dof_ratio));
+      PetscCall(PetscPrintf(comm,
+                            "    Global Vector %" PetscInt_FMT "-DoF nodes          : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n",
+                            num_comp_q, part_owned_dofs[0] / num_comp_q, part_owned_dofs[1] / num_comp_q, part_owned_dofs[2] / num_comp_q,
+                            part_owned_dof_ratio));
     }
 
     PetscCallMPI(MPI_Gather(&local_dofs, 1, MPIU_INT, gather_buffer, 1, MPIU_INT, 0, comm));
@@ -492,18 +488,20 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MP
       part_local_dofs[1]             = gather_buffer[comm_size - 1];  // max
       part_local_dofs[2]             = gather_buffer[median_index];   // median
       PetscReal part_local_dof_ratio = (PetscReal)part_local_dofs[1] / (PetscReal)part_local_dofs[2];
-      PetscCall(PetscPrintf(
-          comm, "    Local Vector %" PetscInt_FMT "-DoF nodes           : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n", num_comp_q,
-          part_local_dofs[0] / num_comp_q, part_local_dofs[1] / num_comp_q, part_local_dofs[2] / num_comp_q, part_local_dof_ratio));
+      PetscCall(PetscPrintf(comm,
+                            "    Local Vector %" PetscInt_FMT "-DoF nodes           : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n",
+                            num_comp_q, part_local_dofs[0] / num_comp_q, part_local_dofs[1] / num_comp_q, part_local_dofs[2] / num_comp_q,
+                            part_local_dof_ratio));
     }
 
     if (comm_size != 1) {
       PetscInt num_remote_roots_total = 0, num_remote_leaves_total = 0, num_ghost_interface_ranks = 0, num_owned_interface_ranks = 0;
       {
         PetscSF            sf;
-        PetscInt           nrranks, niranks;
+        PetscMPIInt        nrranks, niranks;
         const PetscInt    *roffset, *rmine, *rremote, *ioffset, *irootloc;
         const PetscMPIInt *rranks, *iranks;
+
         PetscCall(DMGetSectionSF(user->dm, &sf));
         PetscCall(PetscSFGetRootRanks(sf, &nrranks, &rranks, &roffset, &rmine, &rremote));
         PetscCall(PetscSFGetLeafRanks(sf, &niranks, &iranks, &ioffset, &irootloc));
@@ -525,10 +523,11 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MP
         part_boundary_dofs[1]           = gather_buffer[comm_size - 1];  // max
         part_boundary_dofs[2]           = gather_buffer[median_index];   // median
         PetscReal part_shared_dof_ratio = (PetscReal)part_boundary_dofs[1] / (PetscReal)part_boundary_dofs[2];
-        PetscCall(PetscPrintf(
-            comm, "    Ghost Interface %" PetscInt_FMT "-DoF nodes        : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n",
-            num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q,
-            part_shared_dof_ratio));
+        PetscCall(PetscPrintf(comm,
+                              "    Ghost Interface %" PetscInt_FMT "-DoF nodes        : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT
+                              ", %f\n",
+                              num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q,
+                              part_shared_dof_ratio));
       }
 
       PetscCallMPI(MPI_Gather(&num_ghost_interface_ranks, 1, MPIU_INT, gather_buffer, 1, MPIU_INT, 0, comm));
@@ -549,10 +548,11 @@ PetscErrorCode PrintRunInfo(User user, Physics phys_ctx, ProblemData problem, MP
         part_boundary_dofs[1]           = gather_buffer[comm_size - 1];  // max
         part_boundary_dofs[2]           = gather_buffer[median_index];   // median
         PetscReal part_shared_dof_ratio = (PetscReal)part_boundary_dofs[1] / (PetscReal)part_boundary_dofs[2];
-        PetscCall(PetscPrintf(
-            comm, "    Owned Interface %" PetscInt_FMT "-DoF nodes        : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT ", %f\n",
-            num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q,
-            part_shared_dof_ratio));
+        PetscCall(PetscPrintf(comm,
+                              "    Owned Interface %" PetscInt_FMT "-DoF nodes        : %" PetscInt_FMT ", %" PetscInt_FMT ", %" PetscInt_FMT
+                              ", %f\n",
+                              num_comp_q, part_boundary_dofs[0] / num_comp_q, part_boundary_dofs[1] / num_comp_q, part_boundary_dofs[2] / num_comp_q,
+                              part_shared_dof_ratio));
       }
 
       PetscCallMPI(MPI_Gather(&num_owned_interface_ranks, 1, MPIU_INT, gather_buffer, 1, MPIU_INT, 0, comm));
diff --git a/examples/fluids/src/petsc_ops.c b/examples/fluids/src/petsc_ops.c
index 3706751db0..786b081b2e 100644
--- a/examples/fluids/src/petsc_ops.c
+++ b/examples/fluids/src/petsc_ops.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -71,33 +71,37 @@ PetscErrorCode OperatorApplyContextCreate(DM dm_x, DM dm_y, Ceed ceed, CeedOpera
       PetscCall(VecGetLocalSize(X_loc, &X_size));
       PetscCheck(X_size == x_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                  "X_loc (%" PetscInt_FMT ") not correct size for CeedOperator active input size (%" CeedSize_FMT ")", X_size, x_size);
-      if (dm_x)
+      if (dm_x) {
         PetscCheck(X_size == dm_X_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                    "X_loc size (%" PetscInt_FMT ") does not match dm_x local vector size (%" PetscInt_FMT ")", X_size, dm_X_size);
+      }
     }
     if (Y_loc) {
       PetscCall(VecGetLocalSize(Y_loc, &Y_size));
       PetscCheck(Y_size == y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                  "Y_loc (%" PetscInt_FMT ") not correct size for CeedOperator active output size (%" CeedSize_FMT ")", Y_size, y_size);
-      if (dm_y)
+      if (dm_y) {
         PetscCheck(Y_size == dm_Y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                    "Y_loc size (%" PetscInt_FMT ") does not match dm_y local vector size (%" PetscInt_FMT ")", Y_size, dm_Y_size);
+      }
     }
     if (x_ceed && x_ceed != CEED_VECTOR_NONE) {
       PetscCallCeed(ceed, CeedVectorGetLength(x_ceed, &x_ceed_size));
       PetscCheck(x_size >= 0 ? x_ceed_size == x_size : true, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                  "x_ceed (%" CeedSize_FMT ") not correct size for CeedOperator active input size (%" CeedSize_FMT ")", x_ceed_size, x_size);
-      if (dm_x)
+      if (dm_x) {
         PetscCheck(x_ceed_size == dm_X_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                    "x_ceed size (%" CeedSize_FMT ") does not match dm_x local vector size (%" PetscInt_FMT ")", x_ceed_size, dm_X_size);
+      }
     }
     if (y_ceed && y_ceed != CEED_VECTOR_NONE) {
       PetscCallCeed(ceed, CeedVectorGetLength(y_ceed, &y_ceed_size));
       PetscCheck(y_ceed_size == y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                  "y_ceed (%" CeedSize_FMT ") not correct size for CeedOperator active input size (%" CeedSize_FMT ")", y_ceed_size, y_size);
-      if (dm_y)
+      if (dm_y) {
         PetscCheck(y_ceed_size == dm_Y_size, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ,
                    "y_ceed size (%" CeedSize_FMT ") does not match dm_y local vector size (%" PetscInt_FMT ")", y_ceed_size, dm_Y_size);
+      }
     }
   }
 
@@ -176,9 +180,12 @@ VecType DMReturnVecType(DM dm) {
 PetscErrorCode CeedOperatorCreateLocalVecs(CeedOperator op, VecType vec_type, MPI_Comm comm, Vec *input, Vec *output) {
   CeedSize input_size, output_size;
   Ceed     ceed;
+  int      comm_size;
 
   PetscFunctionBeginUser;
   PetscCall(CeedOperatorGetCeed(op, &ceed));
+  PetscCallMPI(MPI_Comm_size(comm, &comm_size));
+  PetscCheck(comm_size == 1, PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "MPI_Comm must be of size 1, recieved comm of size %d", comm_size);
   PetscCallCeed(ceed, CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
   if (input) {
     PetscCall(VecCreate(comm, input));
@@ -190,6 +197,7 @@ PetscErrorCode CeedOperatorCreateLocalVecs(CeedOperator op, VecType vec_type, MP
     PetscCall(VecSetType(*output, vec_type));
     PetscCall(VecSetSizes(*output, output_size, output_size));
   }
+  PetscCheck(CeedDestroy(&ceed) == CEED_ERROR_SUCCESS, comm, PETSC_ERR_LIB, "Destroying Ceed object failed");
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
@@ -306,10 +314,8 @@ PetscErrorCode ApplyAddCeedOperatorLocalToLocal(Vec X_loc, Vec Y_loc, OperatorAp
  */
 PetscErrorCode CreateSolveOperatorsFromMatCeed(KSP ksp, Mat mat_ceed, PetscBool assemble, Mat *Amat, Mat *Pmat) {
   PetscBool use_matceed_pmat, assemble_amat = PETSC_FALSE;
-  MatType   mat_ceed_inner_type;
 
   PetscFunctionBeginUser;
-  PetscCall(MatCeedGetInnerMatType(mat_ceed, &mat_ceed_inner_type));
   {  // Determine if Amat should be MATCEED or assembled
     const char *ksp_prefix = NULL;
 
@@ -320,7 +326,7 @@ PetscErrorCode CreateSolveOperatorsFromMatCeed(KSP ksp, Mat mat_ceed, PetscBool
   }
 
   if (assemble_amat) {
-    PetscCall(MatConvert(mat_ceed, mat_ceed_inner_type, MAT_INITIAL_MATRIX, Amat));
+    PetscCall(MatCeedCreateMatCOO(mat_ceed, Amat));
     if (assemble) PetscCall(MatCeedAssembleCOO(mat_ceed, *Amat));
 
     PetscCall(PetscObjectReference((PetscObject)*Amat));
@@ -337,14 +343,14 @@ PetscErrorCode CreateSolveOperatorsFromMatCeed(KSP ksp, Mat mat_ceed, PetscBool
 
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCGetType(pc, &pc_type));
-    PetscCall(PetscStrcmpAny(pc_type, &use_matceed_pmat, PCJACOBI, PCVPBJACOBI, PCPBJACOBI, ""));
+    PetscCall(PetscStrcmpAny(pc_type, &use_matceed_pmat, PCNONE, PCJACOBI, PCVPBJACOBI, PCPBJACOBI, ""));
   }
 
   if (use_matceed_pmat) {
     PetscCall(PetscObjectReference((PetscObject)mat_ceed));
     *Pmat = mat_ceed;
   } else {
-    PetscCall(MatConvert(mat_ceed, mat_ceed_inner_type, MAT_INITIAL_MATRIX, Pmat));
+    PetscCall(MatCeedCreateMatCOO(mat_ceed, Pmat));
     if (assemble) PetscCall(MatCeedAssembleCOO(mat_ceed, *Pmat));
   }
   PetscFunctionReturn(PETSC_SUCCESS);
diff --git a/examples/fluids/src/qdata.c b/examples/fluids/src/qdata.c
new file mode 100644
index 0000000000..4288220a6b
--- /dev/null
+++ b/examples/fluids/src/qdata.c
@@ -0,0 +1,199 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "../navierstokes.h"
+
+#include <petscsection.h>
+#include "../qfunctions/setupgeo.h"
+#include "../qfunctions/setupgeo2d.h"
+
+/**
+ * @brief Get number of components of quadrature data for domain
+ *
+ * @param[in]  dm          DM where quadrature data would be used
+ * @param[out] q_data_size Number of components of quadrature data
+ */
+PetscErrorCode QDataGetNumComponents(DM dm, CeedInt *q_data_size) {
+  PetscInt num_comp_x, dim;
+
+  PetscFunctionBeginUser;
+  PetscCall(DMGetDimension(dm, &dim));
+  {  // Get number of coordinate components
+    DM           dm_coord;
+    PetscSection section_coord;
+    PetscInt     field = 0;  // Default field has the coordinates
+    PetscCall(DMGetCoordinateDM(dm, &dm_coord));
+    PetscCall(DMGetLocalSection(dm_coord, &section_coord));
+    PetscCall(PetscSectionGetFieldComponents(section_coord, field, &num_comp_x));
+  }
+  switch (dim) {
+    case 2:
+      switch (num_comp_x) {
+        case 2:
+          *q_data_size = 5;
+          break;
+        case 3:
+          *q_data_size = 7;
+          break;
+        default:
+          SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP,
+                  "QData not valid for DM of dimension %" PetscInt_FMT " and coordinates with dimension %" PetscInt_FMT, dim, num_comp_x);
+          break;
+      }
+      break;
+    case 3:
+      *q_data_size = 10;
+      break;
+    default:
+      SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP,
+              "QData not valid for DM of dimension %" PetscInt_FMT " and coordinates with dimension %" PetscInt_FMT, dim, num_comp_x);
+      break;
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+ * @brief Create quadrature data for domain
+ *
+ * @param[in]  ceed          Ceed object quadrature data will be used with
+ * @param[in]  dm            DM where quadrature data would be used
+ * @param[in]  domain_label  DMLabel that quadrature data would be used one
+ * @param[in]  label_value   Value of label
+ * @param[in]  elem_restr_x  CeedElemRestriction of the coordinates (must match `domain_label` and `label_value` selections)
+ * @param[in]  basis_x       CeedBasis of the coordinates
+ * @param[in]  x_coord       CeedVector of the coordinates
+ * @param[out] elem_restr_qd CeedElemRestriction of the quadrature data
+ * @param[out] q_data        CeedVector of the quadrature data
+ * @param[out] q_data_size   number of components of quadrature data
+ */
+PetscErrorCode QDataGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x,
+                        CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size) {
+  CeedQFunction qf_setup;
+  CeedOperator  op_setup;
+  CeedInt       num_comp_x;
+  PetscInt      dim, height = 0;
+
+  PetscFunctionBeginUser;
+  PetscCall(QDataGetNumComponents(dm, q_data_size));
+  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_x, &num_comp_x));
+  PetscCall(DMGetDimension(dm, &dim));
+  switch (dim) {
+    case 2:
+      switch (num_comp_x) {
+        case 2:
+          PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, Setup2d, Setup2d_loc, &qf_setup));
+          break;
+        case 3:
+          PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, Setup2D_3Dcoords, Setup2D_3Dcoords_loc, &qf_setup));
+          break;
+      }
+      break;
+    case 3:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, Setup, Setup_loc, &qf_setup));
+      break;
+  }
+
+  // -- Create QFunction for quadrature data
+  PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_setup, 0));
+  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup, "dx", num_comp_x * (dim - height), CEED_EVAL_GRAD));
+  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT));
+  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_setup, "surface qdata", *q_data_size, CEED_EVAL_NONE));
+
+  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, *q_data_size, elem_restr_qd));
+  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(*elem_restr_qd, q_data, NULL));
+
+  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_setup, NULL, NULL, &op_setup));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup, "surface qdata", *elem_restr_qd, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+
+  PetscCallCeed(ceed, CeedOperatorApply(op_setup, x_coord, *q_data, CEED_REQUEST_IMMEDIATE));
+
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup));
+  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_setup));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+ * @brief Get number of components of quadrature data for boundary of domain
+ *
+ * @param[in]  dm          DM where quadrature data would be used
+ * @param[out] q_data_size Number of components of quadrature data
+ */
+PetscErrorCode QDataBoundaryGetNumComponents(DM dm, CeedInt *q_data_size) {
+  PetscInt dim;
+
+  PetscFunctionBeginUser;
+  PetscCall(DMGetDimension(dm, &dim));
+  switch (dim) {
+    case 2:
+      *q_data_size = 3;
+      break;
+    case 3:
+      *q_data_size = 10;
+      break;
+    default:
+      SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP, "QDataBoundary not valid for DM of dimension %" PetscInt_FMT, dim);
+      break;
+  }
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+/**
+ * @brief Create quadrature data for boundary of domain
+ *
+ * @param[in]  ceed          Ceed object quadrature data will be used with
+ * @param[in]  dm            DM where quadrature data would be used
+ * @param[in]  domain_label  DMLabel that quadrature data would be used one
+ * @param[in]  label_value   Value of label
+ * @param[in]  elem_restr_x  CeedElemRestriction of the coordinates (must match `domain_label` and `label_value` selections)
+ * @param[in]  basis_x       CeedBasis of the coordinates
+ * @param[in]  x_coord       CeedVector of the coordinates
+ * @param[out] elem_restr_qd CeedElemRestriction of the quadrature data
+ * @param[out] q_data        CeedVector of the quadrature data
+ * @param[out] q_data_size   number of components of quadrature data
+ */
+PetscErrorCode QDataBoundaryGet(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, CeedElemRestriction elem_restr_x, CeedBasis basis_x,
+                                CeedVector x_coord, CeedElemRestriction *elem_restr_qd, CeedVector *q_data, CeedInt *q_data_size) {
+  CeedQFunction qf_setup_sur;
+  CeedOperator  op_setup_sur;
+  CeedInt       num_comp_x;
+  PetscInt      dim, height = 1;
+
+  PetscFunctionBeginUser;
+  PetscCall(QDataBoundaryGetNumComponents(dm, q_data_size));
+  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_x, &num_comp_x));
+  PetscCall(DMGetDimension(dm, &dim));
+  switch (dim) {
+    case 2:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, SetupBoundary2d, SetupBoundary2d_loc, &qf_setup_sur));
+      break;
+    case 3:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, SetupBoundary, SetupBoundary_loc, &qf_setup_sur));
+      break;
+  }
+
+  // -- Create QFunction for quadrature data
+  PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_setup_sur, 0));
+  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup_sur, "dx", num_comp_x * (dim - height), CEED_EVAL_GRAD));
+  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_setup_sur, "weight", 1, CEED_EVAL_WEIGHT));
+  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_setup_sur, "surface qdata", *q_data_size, CEED_EVAL_NONE));
+
+  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, *q_data_size, elem_restr_qd));
+  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(*elem_restr_qd, q_data, NULL));
+
+  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_setup_sur, NULL, NULL, &op_setup_sur));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "surface qdata", *elem_restr_qd, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+
+  PetscCallCeed(ceed, CeedOperatorApply(op_setup_sur, x_coord, *q_data, CEED_REQUEST_IMMEDIATE));
+
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur));
+  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_setup_sur));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
diff --git a/examples/fluids/src/setupdm.c b/examples/fluids/src/setupdm.c
index a0df7dfe69..25573ca63d 100644
--- a/examples/fluids/src/setupdm.c
+++ b/examples/fluids/src/setupdm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -52,24 +52,19 @@ PetscErrorCode SetUpDM(DM dm, ProblemData problem, PetscInt degree, PetscInt q_e
     DMLabel label;
     PetscCall(DMGetLabel(dm, "Face Sets", &label));
     PetscCall(DMPlexLabelComplete(dm, label));
-    // Set wall BCs
-    if (bc->num_wall > 0) {
-      PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "wall", label, bc->num_wall, bc->walls, 0, bc->num_comps, bc->wall_comps, NULL, NULL, NULL, NULL));
-    }
-    // Set symmetry BCs in the x direction
-    if (bc->num_symmetry[0] > 0) {
-      PetscInt comps[1] = {1};
-      PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "symmetry_x", label, bc->num_symmetry[0], bc->symmetries[0], 0, 1, comps, NULL, NULL, NULL, NULL));
-    }
-    // Set symmetry BCs in the y direction
-    if (bc->num_symmetry[1] > 0) {
-      PetscInt comps[1] = {2};
-      PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "symmetry_y", label, bc->num_symmetry[1], bc->symmetries[1], 0, 1, comps, NULL, NULL, NULL, NULL));
-    }
-    // Set symmetry BCs in the z direction
-    if (bc->num_symmetry[2] > 0) {
-      PetscInt comps[1] = {3};
-      PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "symmetry_z", label, bc->num_symmetry[2], bc->symmetries[2], 0, 1, comps, NULL, NULL, NULL, NULL));
+
+    for (PetscInt i = 0; i < problem->num_bc_defs; i++) {
+      BCDefinition    bc_def = problem->bc_defs[i];
+      PetscInt        num_essential_comps, num_label_values;
+      const PetscInt *essential_comps, *label_values;
+      const char     *name;
+
+      PetscCall(BCDefinitionGetEssential(bc_def, &num_essential_comps, &essential_comps));
+      if (essential_comps > 0) {
+        PetscCall(BCDefinitionGetInfo(bc_def, &name, &num_label_values, &label_values));
+        PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, name, label, num_label_values, label_values, 0, num_essential_comps, essential_comps, NULL, NULL,
+                                NULL, NULL));
+      }
     }
     {
       PetscBool use_strongstg = PETSC_FALSE;
@@ -100,6 +95,14 @@ PetscErrorCode SetUpDM(DM dm, ProblemData problem, PetscInt degree, PetscInt q_e
       PetscCall(PetscSectionSetComponentName(section, 0, 3, "VelocityZ"));
       PetscCall(PetscSectionSetComponentName(section, 0, 4, "Temperature"));
       break;
+
+    case STATEVAR_ENTROPY:
+      PetscCall(PetscSectionSetComponentName(section, 0, 0, "EntropyDensity"));
+      PetscCall(PetscSectionSetComponentName(section, 0, 1, "EntropyMomentumX"));
+      PetscCall(PetscSectionSetComponentName(section, 0, 2, "EntropyMomentumY"));
+      PetscCall(PetscSectionSetComponentName(section, 0, 3, "EntropyMomentumZ"));
+      PetscCall(PetscSectionSetComponentName(section, 0, 4, "EntropyTotalEnergy"));
+      break;
   }
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c
index 18630c0279..bb801aa269 100644
--- a/examples/fluids/src/setuplibceed.c
+++ b/examples/fluids/src/setuplibceed.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -28,14 +28,12 @@ static PetscErrorCode CreateKSPMassOperator_Unstabilized(User user, CeedOperator
     CeedOperatorField field;
     PetscInt          sub_op_index = 0;  // will be 0 for the volume op
 
-    PetscCallCeed(ceed, CeedCompositeOperatorGetSubList(user->op_rhs_ctx->op, &sub_ops));
+    PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(user->op_rhs_ctx->op, &sub_ops));
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q));
-    PetscCallCeed(ceed, CeedOperatorFieldGetBasis(field, &basis_q));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_q, &basis_q, NULL));
 
     PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "qdata", &field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_qd_i));
-    PetscCallCeed(ceed, CeedOperatorFieldGetVector(field, &q_data));
+    PetscCallCeed(ceed, CeedOperatorFieldGetData(field, NULL, &elem_restr_qd_i, NULL, &q_data));
   }
 
   PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_q, &num_comp_q));
@@ -47,6 +45,10 @@ static PetscErrorCode CreateKSPMassOperator_Unstabilized(User user, CeedOperator
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, q_data));
   PetscCallCeed(ceed, CeedOperatorSetField(*op_mass, "v", elem_restr_q, basis_q, CEED_VECTOR_ACTIVE));
 
+  PetscCallCeed(ceed, CeedVectorDestroy(&q_data));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i));
+  PetscCallCeed(ceed, CeedBasisDestroy(&basis_q));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -68,7 +70,7 @@ static PetscErrorCode CreateKSPMass(User user, ProblemData problem) {
 
     PetscCall(DMCreateLocalVector(dm, &Zeros_loc));
     PetscCall(VecZeroEntries(Zeros_loc));
-    PetscCall(MatCeedCreate(dm, dm, op_mass, NULL, &mat_mass));
+    PetscCall(MatCreateCeed(dm, dm, op_mass, NULL, &mat_mass));
     PetscCall(MatCeedSetLocalVectors(mat_mass, Zeros_loc, NULL));
 
     PetscCall(KSPCreate(comm, &user->mass_ksp));
@@ -81,7 +83,6 @@ static PetscErrorCode CreateKSPMass(User user, ProblemData problem) {
       PetscCall(KSPSetType(user->mass_ksp, KSPPREONLY));
     }
     PetscCall(KSPSetFromOptions_WithMatCeed(user->mass_ksp, mat_mass));
-    PetscCall(KSPSetFromOptions(user->mass_ksp));
     PetscCall(VecDestroy(&Zeros_loc));
     PetscCall(MatDestroy(&mat_mass));
   }
@@ -90,128 +91,66 @@ static PetscErrorCode CreateKSPMass(User user, ProblemData problem) {
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
-PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DMLabel domain_label, PetscInt label_value, CeedInt height, CeedInt Q_sur,
-                                CeedInt q_data_size_sur, CeedInt jac_data_size_sur, CeedQFunction qf_apply_bc, CeedQFunction qf_apply_bc_jacobian,
-                                CeedOperator *op_apply, CeedOperator *op_apply_ijacobian) {
-  CeedVector          q_data_sur, jac_data_sur = NULL;
-  CeedOperator        op_setup_sur, op_apply_bc, op_apply_bc_jacobian = NULL;
+static PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DMLabel domain_label, PetscInt label_value, CeedInt height,
+                                       CeedInt Q_sur, CeedInt q_data_size_sur, CeedInt jac_data_size_sur, CeedBasis basis_q_sur,
+                                       CeedBasis basis_x_sur, CeedQFunction qf_apply_bc, CeedQFunction qf_apply_bc_jacobian, CeedOperator op_apply,
+                                       CeedOperator op_apply_ijacobian) {
+  CeedVector          q_data_sur, jac_data_sur          = NULL;
+  CeedOperator        op_apply_bc, op_apply_bc_jacobian = NULL;
   CeedElemRestriction elem_restr_x_sur, elem_restr_q_sur, elem_restr_qd_i_sur, elem_restr_jd_i_sur = NULL;
-  CeedInt             num_qpts_sur, dm_field = 0;
+  PetscInt            dm_field = 0;
 
   PetscFunctionBeginUser;
-  // --- Get number of quadrature points for the boundaries
-  PetscCallCeed(ceed, CeedBasisGetNumQuadraturePoints(ceed_data->basis_q_sur, &num_qpts_sur));
-
-  // ---- CEED Restriction
   PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, dm_field, &elem_restr_q_sur));
   PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &elem_restr_x_sur));
-  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, q_data_size_sur, &elem_restr_qd_i_sur));
   if (jac_data_size_sur > 0) {
     // State-dependent data will be passed from residual to Jacobian. This will be collocated.
     PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, jac_data_size_sur, &elem_restr_jd_i_sur));
     PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i_sur, &jac_data_sur, NULL));
   }
 
-  // ---- CEED Vector
-  CeedInt loc_num_elem_sur;
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumElements(elem_restr_q_sur, &loc_num_elem_sur));
-  PetscCallCeed(ceed, CeedVectorCreate(ceed, q_data_size_sur * loc_num_elem_sur * num_qpts_sur, &q_data_sur));
-
-  // ---- CEED Operator
-  // ----- CEED Operator for Setup (geometric factors)
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_setup_sur, NULL, NULL, &op_setup_sur));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x_sur, ceed_data->basis_x_sur, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, ceed_data->basis_x_sur, CEED_VECTOR_NONE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+  PetscCall(QDataBoundaryGet(ceed, dm, domain_label, label_value, elem_restr_x_sur, basis_x_sur, ceed_data->x_coord, &elem_restr_qd_i_sur,
+                             &q_data_sur, &q_data_size_sur));
 
-  // ----- CEED Operator for Physics
+  // CEED Operator for Physics
   PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_apply_bc, NULL, NULL, &op_apply_bc));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "q", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "Grad_q", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "q", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "Grad_q", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
   PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, q_data_sur));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "x", elem_restr_x_sur, ceed_data->basis_x_sur, ceed_data->x_coord));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "v", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
-  if (elem_restr_jd_i_sur)
+  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "x", elem_restr_x_sur, basis_x_sur, ceed_data->x_coord));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "v", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
+  if (elem_restr_jd_i_sur) {
     PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc, "surface jacobian data", elem_restr_jd_i_sur, CEED_BASIS_NONE, jac_data_sur));
+  }
 
   if (qf_apply_bc_jacobian && elem_restr_jd_i_sur) {
     PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_apply_bc_jacobian, NULL, NULL, &op_apply_bc_jacobian));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "dq", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "Grad_dq", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "dq", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "Grad_dq", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
     PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_NONE, q_data_sur));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "x", elem_restr_x_sur, ceed_data->basis_x_sur, ceed_data->x_coord));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "x", elem_restr_x_sur, basis_x_sur, ceed_data->x_coord));
     PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "surface jacobian data", elem_restr_jd_i_sur, CEED_BASIS_NONE, jac_data_sur));
-    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "v", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_apply_bc_jacobian, "v", elem_restr_q_sur, basis_q_sur, CEED_VECTOR_ACTIVE));
   }
 
-  // ----- Apply CEED operator for Setup
-  PetscCallCeed(ceed, CeedOperatorApply(op_setup_sur, ceed_data->x_coord, q_data_sur, CEED_REQUEST_IMMEDIATE));
-
-  // ----- Apply Sub-Operator for Physics
-  PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply, op_apply_bc));
-  if (op_apply_bc_jacobian) PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply_ijacobian, op_apply_bc_jacobian));
+  // Apply Sub-Operator for Physics
+  PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_apply, op_apply_bc));
+  if (op_apply_bc_jacobian) PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_apply_ijacobian, op_apply_bc_jacobian));
 
-  // ----- Cleanup
   PetscCallCeed(ceed, CeedVectorDestroy(&q_data_sur));
   PetscCallCeed(ceed, CeedVectorDestroy(&jac_data_sur));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q_sur));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_x_sur));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd_i_sur));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_jd_i_sur));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur));
   PetscCallCeed(ceed, CeedOperatorDestroy(&op_apply_bc));
   PetscCallCeed(ceed, CeedOperatorDestroy(&op_apply_bc_jacobian));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
-// Utility function to create CEED Composite Operator for the entire domain
-PetscErrorCode CreateOperatorForDomain(Ceed ceed, DM dm, SimpleBC bc, CeedData ceed_data, Physics phys, CeedOperator op_apply_vol,
-                                       CeedOperator op_apply_ijacobian_vol, CeedInt height, CeedInt P_sur, CeedInt Q_sur, CeedInt q_data_size_sur,
-                                       CeedInt jac_data_size_sur, CeedOperator *op_apply, CeedOperator *op_apply_ijacobian) {
-  DMLabel domain_label;
-
-  PetscFunctionBeginUser;
-  // Create Composite Operaters
-  PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, op_apply));
-  if (op_apply_ijacobian) PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, op_apply_ijacobian));
-
-  // --Apply Sub-Operator for the volume
-  PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply, op_apply_vol));
-  if (op_apply_ijacobian) PetscCallCeed(ceed, CeedCompositeOperatorAddSub(*op_apply_ijacobian, op_apply_ijacobian_vol));
-
-  // -- Create Sub-Operator for in/outflow BCs
-  PetscCall(DMGetLabel(dm, "Face Sets", &domain_label));
-
-  // --- Create Sub-Operator for inflow boundaries
-  for (CeedInt i = 0; i < bc->num_inflow; i++) {
-    PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->inflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
-                               ceed_data->qf_apply_inflow, ceed_data->qf_apply_inflow_jacobian, op_apply, op_apply_ijacobian));
-  }
-  // --- Create Sub-Operator for outflow boundaries
-  for (CeedInt i = 0; i < bc->num_outflow; i++) {
-    PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->outflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
-                               ceed_data->qf_apply_outflow, ceed_data->qf_apply_outflow_jacobian, op_apply, op_apply_ijacobian));
-  }
-  // --- Create Sub-Operator for freestream boundaries
-  for (CeedInt i = 0; i < bc->num_freestream; i++) {
-    PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->freestreams[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
-                               ceed_data->qf_apply_freestream, ceed_data->qf_apply_freestream_jacobian, op_apply, op_apply_ijacobian));
-  }
-  // --- Create Sub-Operator for slip boundaries
-  for (CeedInt i = 0; i < bc->num_slip; i++) {
-    PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->slips[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
-                               ceed_data->qf_apply_slip, ceed_data->qf_apply_slip_jacobian, op_apply, op_apply_ijacobian));
-  }
-
-  // ----- Get Context Labels for Operator
-  PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(*op_apply, "solution time", &phys->solution_time_label));
-  PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(*op_apply, "timestep size", &phys->timestep_size_label));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt num_comp_x, PetscInt num_comp_q, PetscInt q_data_size_sur,
-                                 PetscInt jac_data_size_sur, ProblemQFunctionSpec apply_bc, ProblemQFunctionSpec apply_bc_jacobian,
-                                 CeedQFunction *qf_apply_bc, CeedQFunction *qf_apply_bc_jacobian) {
+static PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt num_comp_x, PetscInt num_comp_q, PetscInt q_data_size_sur,
+                                        PetscInt jac_data_size_sur, ProblemQFunctionSpec apply_bc, ProblemQFunctionSpec apply_bc_jacobian,
+                                        CeedQFunction *qf_apply_bc, CeedQFunction *qf_apply_bc_jacobian) {
   PetscFunctionBeginUser;
   if (apply_bc.qfunction) {
     PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, apply_bc.qfunction, apply_bc.qfunction_loc, qf_apply_bc));
@@ -238,14 +177,117 @@ PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt num_comp_
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
+// Utility function to add boundary operators to the composite operator
+static PetscErrorCode AddBCSubOperators(User user, Ceed ceed, DM dm, SimpleBC bc, ProblemData problem, CeedData ceed_data, CeedOperator op_apply,
+                                        CeedOperator op_apply_ijacobian) {
+  CeedInt       height = 1, num_comp_q, num_comp_x;
+  CeedInt       P_sur = user->app_ctx->degree + 1, Q_sur = P_sur + user->app_ctx->q_extra, dim_sur, q_data_size_sur;
+  const CeedInt jac_data_size_sur = user->phys->implicit ? problem->jac_data_size_sur : 0;
+  PetscInt      dim;
+  DMLabel       face_sets_label;
+  CeedBasis     basis_q_sur, basis_x_sur;
+
+  PetscFunctionBeginUser;
+  PetscCall(DMGetDimension(dm, &dim));
+  PetscCall(QDataBoundaryGetNumComponents(dm, &q_data_size_sur));
+  dim_sur = dim - height;
+  {  // Get number of components and coordinate dimension from op_apply
+    CeedOperator       *sub_ops;
+    CeedOperatorField   field;
+    PetscInt            sub_op_index = 0;  // will be 0 for the volume op
+    CeedElemRestriction elem_restr_q, elem_restr_x;
+
+    PetscCallCeed(ceed, CeedOperatorCompositeGetSubList(op_apply, &sub_ops));
+    PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "q", &field));
+    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_q));
+    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_q, &num_comp_q));
+    PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_q));
+
+    PetscCallCeed(ceed, CeedOperatorGetFieldByName(sub_ops[sub_op_index], "x", &field));
+    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(field, &elem_restr_x));
+    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_x, &num_comp_x));
+    PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_x));
+  }
+
+  {  // Get bases
+    DM dm_coord;
+
+    PetscCall(DMGetCoordinateDM(dm, &dm_coord));
+    DMLabel  label       = NULL;
+    PetscInt label_value = 0;
+    PetscInt field       = 0;
+    PetscCall(CreateBasisFromPlex(ceed, dm, label, label_value, height, field, &basis_q_sur));
+    PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, label_value, height, field, &basis_x_sur));
+  }
+
+  PetscCall(DMGetLabel(dm, "Face Sets", &face_sets_label));
+
+  {  // --- Create Sub-Operator for inflow boundaries
+    CeedQFunction qf_apply_inflow = NULL, qf_apply_inflow_jacobian = NULL;
+
+    PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_inflow,
+                                problem->apply_inflow_jacobian, &qf_apply_inflow, &qf_apply_inflow_jacobian));
+    for (CeedInt i = 0; i < bc->num_inflow; i++) {
+      PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->inflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, basis_q_sur,
+                                 basis_x_sur, qf_apply_inflow, qf_apply_inflow_jacobian, op_apply, op_apply_ijacobian));
+    }
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_inflow));
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_inflow_jacobian));
+  }
+
+  {  // --- Create Sub-Operator for outflow boundaries
+    CeedQFunction qf_apply_outflow = NULL, qf_apply_outflow_jacobian = NULL;
+
+    PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_outflow,
+                                problem->apply_outflow_jacobian, &qf_apply_outflow, &qf_apply_outflow_jacobian));
+    for (CeedInt i = 0; i < bc->num_outflow; i++) {
+      PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->outflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
+                                 basis_q_sur, basis_x_sur, qf_apply_outflow, qf_apply_outflow_jacobian, op_apply, op_apply_ijacobian));
+    }
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_outflow));
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_outflow_jacobian));
+  }
+
+  {  // --- Create Sub-Operator for freestream boundaries
+    CeedQFunction qf_apply_freestream = NULL, qf_apply_freestream_jacobian = NULL;
+
+    PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_freestream,
+                                problem->apply_freestream_jacobian, &qf_apply_freestream, &qf_apply_freestream_jacobian));
+    for (CeedInt i = 0; i < bc->num_freestream; i++) {
+      PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->freestreams[i], height, Q_sur, q_data_size_sur, jac_data_size_sur,
+                                 basis_q_sur, basis_x_sur, qf_apply_freestream, qf_apply_freestream_jacobian, op_apply, op_apply_ijacobian));
+    }
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_freestream));
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_freestream_jacobian));
+  }
+
+  {  // --- Create Sub-Operator for slip boundaries
+    CeedQFunction qf_apply_slip = NULL, qf_apply_slip_jacobian = NULL;
+
+    PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_slip,
+                                problem->apply_slip_jacobian, &qf_apply_slip, &qf_apply_slip_jacobian));
+    for (CeedInt i = 0; i < bc->num_slip; i++) {
+      PetscCall(AddBCSubOperator(ceed, dm, ceed_data, face_sets_label, bc->slips[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, basis_q_sur,
+                                 basis_x_sur, qf_apply_slip, qf_apply_slip_jacobian, op_apply, op_apply_ijacobian));
+    }
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_slip));
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_apply_slip_jacobian));
+  }
+
+  PetscCallCeed(ceed, CeedBasisDestroy(&basis_q_sur));
+  PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_sur));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
 PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData problem, SimpleBC bc) {
+  const PetscInt      num_comp_q = 5;
+  const CeedInt       dim = problem->dim, num_comp_x = problem->dim;
+  CeedInt             jac_data_size_vol = num_comp_q + 6 + 3;
+  CeedElemRestriction elem_restr_jd_i;
+  CeedVector          jac_data;
+  CeedOperator        op_ifunction_vol = NULL, op_rhs_vol = NULL, op_ijacobian_vol = NULL;
+
   PetscFunctionBeginUser;
-  // *****************************************************************************
-  // Set up CEED objects for the interior domain (volume)
-  // *****************************************************************************
-  const PetscInt num_comp_q = 5;
-  const CeedInt  dim = problem->dim, num_comp_x = problem->dim, q_data_size_vol = problem->q_data_size_vol;
-  CeedInt        jac_data_size_vol = num_comp_q + 6 + 3;
 
   if (problem->apply_vol_ifunction.qfunction && problem->uses_newtonian) {
     NewtonianIdealGasContext gas;
@@ -254,265 +296,193 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, App
     PetscCallCeed(ceed, CeedQFunctionContextRestoreDataRead(problem->apply_vol_ifunction.qfunction_context, &gas));
   }
 
-  CeedElemRestriction elem_restr_jd_i;
-  CeedVector          jac_data;
-  CeedInt             num_qpts;
-  DMLabel             domain_label = NULL;
-  PetscInt            label_value = 0, height = 0, dm_field = 0;
-
-  // -----------------------------------------------------------------------------
-  // CEED Bases
-  // -----------------------------------------------------------------------------
-  DM dm_coord;
-  PetscCall(DMGetCoordinateDM(dm, &dm_coord));
-
-  PetscCall(CreateBasisFromPlex(ceed, dm, domain_label, label_value, height, dm_field, &ceed_data->basis_q));
-  PetscCall(CreateBasisFromPlex(ceed, dm_coord, domain_label, label_value, height, dm_field, &ceed_data->basis_x));
-  PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x, ceed_data->basis_q, &ceed_data->basis_xc));
-  PetscCallCeed(ceed, CeedBasisGetNumQuadraturePoints(ceed_data->basis_q, &num_qpts));
-
-  // -----------------------------------------------------------------------------
-  // CEED Restrictions
-  // -----------------------------------------------------------------------------
-  // -- Create restriction
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, 0, &ceed_data->elem_restr_q));
-  PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &ceed_data->elem_restr_x));
-  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, q_data_size_vol, &ceed_data->elem_restr_qd_i));
-  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, jac_data_size_vol, &elem_restr_jd_i));
-  // -- Create E vectors
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_ceed, NULL));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_dot_ceed, NULL));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->g_ceed, NULL));
-
-  // -----------------------------------------------------------------------------
-  // CEED QFunctions
-  // -----------------------------------------------------------------------------
-  // -- Create QFunction for quadrature data
-  PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->setup_vol.qfunction, problem->setup_vol.qfunction_loc, &ceed_data->qf_setup_vol));
-  if (problem->setup_vol.qfunction_context) {
-    PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_setup_vol, problem->setup_vol.qfunction_context));
+  {  // Create bases and element restrictions
+    DMLabel  domain_label = NULL;
+    PetscInt label_value = 0, height = 0, dm_field = 0;
+    DM       dm_coord;
+
+    PetscCall(DMGetCoordinateDM(dm, &dm_coord));
+    PetscCall(CreateBasisFromPlex(ceed, dm, domain_label, label_value, height, dm_field, &ceed_data->basis_q));
+    PetscCall(CreateBasisFromPlex(ceed, dm_coord, domain_label, label_value, height, dm_field, &ceed_data->basis_x));
+
+    PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, 0, &ceed_data->elem_restr_q));
+    PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &ceed_data->elem_restr_x));
+    PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, jac_data_size_vol, &elem_restr_jd_i));
+
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_ceed, NULL));
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_dot_ceed, NULL));
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->g_ceed, NULL));
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_x, &ceed_data->x_coord, NULL));
+    PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i, &jac_data, NULL));
+
+    {  // -- Copy PETSc coordinate vector into CEED vector
+      Vec X_loc;
+      DM  cdm;
+
+      PetscCall(DMGetCellCoordinateDM(dm, &cdm));
+      if (cdm) {
+        PetscCall(DMGetCellCoordinatesLocal(dm, &X_loc));
+      } else {
+        PetscCall(DMGetCoordinatesLocal(dm, &X_loc));
+      }
+      PetscCall(VecScale(X_loc, problem->dm_scale));
+      PetscCall(VecCopyPetscToCeed(X_loc, ceed_data->x_coord));
+    }
+
+    PetscCall(QDataGet(ceed, dm, domain_label, label_value, ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord,
+                       &ceed_data->elem_restr_qd_i, &ceed_data->q_data, &problem->q_data_size_vol));
+  }
+
+  {  // -- Create QFunction for ICs
+    CeedBasis     basis_xc;
+    CeedQFunction qf_ics;
+    CeedOperator  op_ics;
+
+    PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x, ceed_data->basis_q, &basis_xc));
+    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->ics.qfunction, problem->ics.qfunction_loc, &qf_ics));
+    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_ics, problem->ics.qfunction_context));
+    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ics, 0));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ics, "x", num_comp_x, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ics, "dx", num_comp_x * dim, CEED_EVAL_GRAD));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ics, "q0", num_comp_q, CEED_EVAL_NONE));
+
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ics, NULL, NULL, &op_ics));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "x", ceed_data->elem_restr_x, basis_xc, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "dx", ceed_data->elem_restr_x, basis_xc, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "q0", ceed_data->elem_restr_q, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_ics, "evaluation time", &user->phys->ics_time_label));
+    PetscCall(OperatorApplyContextCreate(NULL, dm, user->ceed, op_ics, ceed_data->x_coord, NULL, NULL, user->Q_loc, &ceed_data->op_ics_ctx));
+
+    PetscCallCeed(ceed, CeedBasisDestroy(&basis_xc));
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ics));
+    PetscCallCeed(ceed, CeedOperatorDestroy(&op_ics));
   }
-  PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_setup_vol, 0));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_vol, "dx", num_comp_x * dim, CEED_EVAL_GRAD));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_vol, "weight", 1, CEED_EVAL_WEIGHT));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_setup_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
-
-  // -- Create QFunction for ICs
-  PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->ics.qfunction, problem->ics.qfunction_loc, &ceed_data->qf_ics));
-  PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_ics, problem->ics.qfunction_context));
-  PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_ics, 0));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ics, "x", num_comp_x, CEED_EVAL_INTERP));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ics, "dx", num_comp_x * dim, CEED_EVAL_GRAD));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ics, "q0", num_comp_q, CEED_EVAL_NONE));
-
-  // -- Create QFunction for RHS
+
   if (problem->apply_vol_rhs.qfunction) {
-    PetscCallCeed(
-        ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_rhs.qfunction, problem->apply_vol_rhs.qfunction_loc, &ceed_data->qf_rhs_vol));
-    PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_rhs_vol, problem->apply_vol_rhs.qfunction_context));
-    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_rhs_vol, 0));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "q", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "x", num_comp_x, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_rhs_vol, "v", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_rhs_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
+    CeedQFunction qf_rhs_vol;
+
+    PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_rhs.qfunction, problem->apply_vol_rhs.qfunction_loc, &qf_rhs_vol));
+    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_rhs_vol, problem->apply_vol_rhs.qfunction_context));
+    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_rhs_vol, 0));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "q", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "qdata", problem->q_data_size_vol, CEED_EVAL_NONE));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_rhs_vol, "x", num_comp_x, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_rhs_vol, "v", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_rhs_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
+
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_rhs_vol, NULL, NULL, &op_rhs_vol));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_rhs_vol, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_rhs_vol));
   }
 
-  // -- Create QFunction for IFunction
   if (problem->apply_vol_ifunction.qfunction) {
+    CeedQFunction qf_ifunction_vol;
+
     PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ifunction.qfunction, problem->apply_vol_ifunction.qfunction_loc,
-                                                    &ceed_data->qf_ifunction_vol));
-    PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_ifunction_vol, problem->apply_vol_ifunction.qfunction_context));
-    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_ifunction_vol, 0));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "q", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "q dot", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "x", num_comp_x, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "v", num_comp_q, CEED_EVAL_INTERP));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE));
+                                                    &qf_ifunction_vol));
+    PetscCallCeed(ceed, CeedQFunctionSetContext(qf_ifunction_vol, problem->apply_vol_ifunction.qfunction_context));
+    PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ifunction_vol, 0));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "q", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "q dot", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "qdata", problem->q_data_size_vol, CEED_EVAL_NONE));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ifunction_vol, "x", num_comp_x, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "v", num_comp_q, CEED_EVAL_INTERP));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
+    PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ifunction_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE));
+
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ifunction_vol, NULL, NULL, &op_ifunction_vol));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "q dot", ceed_data->elem_restr_q, ceed_data->basis_q, user->q_dot_ceed));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ifunction_vol, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data));
+
+    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ifunction_vol));
   }
 
-  CeedQFunction qf_ijacobian_vol = NULL;
   if (problem->apply_vol_ijacobian.qfunction) {
+    CeedQFunction qf_ijacobian_vol;
+
     PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ijacobian.qfunction, problem->apply_vol_ijacobian.qfunction_loc,
                                                     &qf_ijacobian_vol));
     PetscCallCeed(ceed, CeedQFunctionSetContext(qf_ijacobian_vol, problem->apply_vol_ijacobian.qfunction_context));
     PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(qf_ijacobian_vol, 0));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "dq", num_comp_q, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "Grad_dq", num_comp_q * dim, CEED_EVAL_GRAD));
-    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE));
+    PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "qdata", problem->q_data_size_vol, CEED_EVAL_NONE));
     PetscCallCeed(ceed, CeedQFunctionAddInput(qf_ijacobian_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE));
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "v", num_comp_q, CEED_EVAL_INTERP));
     PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_ijacobian_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD));
-  }
-
-  // ---------------------------------------------------------------------------
-  // Element coordinates
-  // ---------------------------------------------------------------------------
-  // -- Create CEED vector
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_x, &ceed_data->x_coord, NULL));
-
-  // -- Copy PETSc vector in CEED vector
-  Vec X_loc;
-  {
-    DM cdm;
-    PetscCall(DMGetCellCoordinateDM(dm, &cdm));
-    if (cdm) {
-      PetscCall(DMGetCellCoordinatesLocal(dm, &X_loc));
-    } else {
-      PetscCall(DMGetCoordinatesLocal(dm, &X_loc));
-    }
-  }
-  PetscCall(VecScale(X_loc, problem->dm_scale));
-  PetscCall(VecCopyPetscToCeed(X_loc, ceed_data->x_coord));
-
-  // -----------------------------------------------------------------------------
-  // CEED vectors
-  // -----------------------------------------------------------------------------
-  // -- Create CEED vector for geometric data
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(ceed_data->elem_restr_qd_i, &ceed_data->q_data, NULL));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_jd_i, &jac_data, NULL));
-
-  // -----------------------------------------------------------------------------
-  // CEED Operators
-  // -----------------------------------------------------------------------------
-  // -- Create CEED operator for quadrature data
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_setup_vol, NULL, NULL, &ceed_data->op_setup_vol));
-  PetscCallCeed(ceed, CeedOperatorSetField(ceed_data->op_setup_vol, "dx", ceed_data->elem_restr_x, ceed_data->basis_x, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(ceed_data->op_setup_vol, "weight", CEED_ELEMRESTRICTION_NONE, ceed_data->basis_x, CEED_VECTOR_NONE));
-  PetscCallCeed(ceed, CeedOperatorSetField(ceed_data->op_setup_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-  // -- Create CEED operator for ICs
-  CeedOperator op_ics;
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_ics, NULL, NULL, &op_ics));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "x", ceed_data->elem_restr_x, ceed_data->basis_xc, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "dx", ceed_data->elem_restr_x, ceed_data->basis_xc, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_ics, "q0", ceed_data->elem_restr_q, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_ics, "evaluation time", &user->phys->ics_time_label));
-  PetscCall(OperatorApplyContextCreate(NULL, dm, user->ceed, op_ics, ceed_data->x_coord, NULL, NULL, user->Q_loc, &ceed_data->op_ics_ctx));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_ics));
-
-  // Create CEED operator for RHS
-  if (ceed_data->qf_rhs_vol) {
-    CeedOperator op;
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_rhs_vol, NULL, NULL, &op));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    user->op_rhs_vol = op;
-  }
 
-  // -- CEED operator for IFunction
-  if (ceed_data->qf_ifunction_vol) {
-    CeedOperator op;
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_ifunction_vol, NULL, NULL, &op));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "q dot", ceed_data->elem_restr_q, ceed_data->basis_q, user->q_dot_ceed));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data));
-
-    user->op_ifunction_vol = op;
-  }
+    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ijacobian_vol, NULL, NULL, &op_ijacobian_vol));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "Grad_dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
+    PetscCallCeed(ceed, CeedOperatorSetField(op_ijacobian_vol, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
 
-  CeedOperator op_ijacobian_vol = NULL;
-  if (qf_ijacobian_vol) {
-    CeedOperator op;
-    PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_ijacobian_vol, NULL, NULL, &op));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_NONE, ceed_data->q_data));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "jac_data", elem_restr_jd_i, CEED_BASIS_NONE, jac_data));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    PetscCallCeed(ceed, CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE));
-    op_ijacobian_vol = op;
     PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_ijacobian_vol));
   }
 
-  // *****************************************************************************
-  // Set up CEED objects for the exterior domain (surface)
-  // *****************************************************************************
-  height                = 1;
-  CeedInt       dim_sur = dim - height, P_sur = app_ctx->degree + 1, Q_sur = P_sur + app_ctx->q_extra;
-  const CeedInt q_data_size_sur = problem->q_data_size_sur, jac_data_size_sur = user->phys->implicit ? problem->jac_data_size_sur : 0;
-
-  // -----------------------------------------------------------------------------
-  // CEED Bases
-  // -----------------------------------------------------------------------------
-
-  DMLabel  label   = 0;
-  PetscInt face_id = 0;
-  PetscInt field   = 0;  // Still want the normal, default field
-  PetscCall(CreateBasisFromPlex(ceed, dm, label, face_id, height, field, &ceed_data->basis_q_sur));
-  PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, face_id, height, field, &ceed_data->basis_x_sur));
-
-  // -----------------------------------------------------------------------------
-  // CEED QFunctions
-  // -----------------------------------------------------------------------------
-  // -- Create QFunction for quadrature data
-  PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, problem->setup_sur.qfunction, problem->setup_sur.qfunction_loc, &ceed_data->qf_setup_sur));
-  if (problem->setup_sur.qfunction_context) {
-    PetscCallCeed(ceed, CeedQFunctionSetContext(ceed_data->qf_setup_sur, problem->setup_sur.qfunction_context));
-  }
-  PetscCallCeed(ceed, CeedQFunctionSetUserFlopsEstimate(ceed_data->qf_setup_sur, 0));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_sur, "dx", num_comp_x * dim_sur, CEED_EVAL_GRAD));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(ceed_data->qf_setup_sur, "weight", 1, CEED_EVAL_WEIGHT));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(ceed_data->qf_setup_sur, "surface qdata", q_data_size_sur, CEED_EVAL_NONE));
-
-  PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_inflow,
-                              problem->apply_inflow_jacobian, &ceed_data->qf_apply_inflow, &ceed_data->qf_apply_inflow_jacobian));
-  PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_outflow,
-                              problem->apply_outflow_jacobian, &ceed_data->qf_apply_outflow, &ceed_data->qf_apply_outflow_jacobian));
-  PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_freestream,
-                              problem->apply_freestream_jacobian, &ceed_data->qf_apply_freestream, &ceed_data->qf_apply_freestream_jacobian));
-  PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_slip,
-                              problem->apply_slip_jacobian, &ceed_data->qf_apply_slip, &ceed_data->qf_apply_slip_jacobian));
-
-  // *****************************************************************************
-  // CEED Operator Apply
-  // *****************************************************************************
-  // -- Apply CEED Operator for the geometric data
-  PetscCallCeed(ceed, CeedOperatorApply(ceed_data->op_setup_vol, ceed_data->x_coord, ceed_data->q_data, CEED_REQUEST_IMMEDIATE));
-
   // -- Create and apply CEED Composite Operator for the entire domain
   if (!user->phys->implicit) {  // RHS
     CeedOperator op_rhs;
-    PetscCall(CreateOperatorForDomain(ceed, dm, bc, ceed_data, user->phys, user->op_rhs_vol, NULL, height, P_sur, Q_sur, q_data_size_sur, 0, &op_rhs,
-                                      NULL));
+
+    PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_rhs));
+    PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_rhs, op_rhs_vol));
+    PetscCall(AddBCSubOperators(user, ceed, dm, bc, problem, ceed_data, op_rhs, NULL));
+
     PetscCall(OperatorApplyContextCreate(dm, dm, ceed, op_rhs, user->q_ceed, user->g_ceed, user->Q_loc, NULL, &user->op_rhs_ctx));
+
+    // ----- Get Context Labels for Operator
+    PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_rhs, "solution time", &user->phys->solution_time_label));
+    PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_rhs, "timestep size", &user->phys->timestep_size_label));
+
     PetscCallCeed(ceed, CeedOperatorDestroy(&op_rhs));
     PetscCall(CreateKSPMass(user, problem));
-    PetscCheck(app_ctx->sgs_model_type == SGS_MODEL_NONE, user->comm, PETSC_ERR_SUP, "SGS modeling not implemented for explicit timestepping");
   } else {  // IFunction
     CeedOperator op_ijacobian = NULL;
 
-    PetscCall(CreateOperatorForDomain(ceed, dm, bc, ceed_data, user->phys, user->op_ifunction_vol, op_ijacobian_vol, height, P_sur, Q_sur,
-                                      q_data_size_sur, jac_data_size_sur, &user->op_ifunction, op_ijacobian_vol ? &op_ijacobian : NULL));
+    // Create Composite Operaters
+    PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &user->op_ifunction));
+    PetscCallCeed(ceed, CeedOperatorCompositeAddSub(user->op_ifunction, op_ifunction_vol));
+    if (op_ijacobian_vol) {
+      PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_ijacobian));
+      PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_ijacobian, op_ijacobian_vol));
+    }
+    PetscCall(AddBCSubOperators(user, ceed, dm, bc, problem, ceed_data, user->op_ifunction, op_ijacobian));
+
+    // ----- Get Context Labels for Operator
+    PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(user->op_ifunction, "solution time", &user->phys->solution_time_label));
+    PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(user->op_ifunction, "timestep size", &user->phys->timestep_size_label));
+
     if (op_ijacobian) {
-      PetscCall(MatCeedCreate(user->dm, user->dm, op_ijacobian, NULL, &user->mat_ijacobian));
+      PetscCall(MatCreateCeed(user->dm, user->dm, op_ijacobian, NULL, &user->mat_ijacobian));
       PetscCall(MatCeedSetLocalVectors(user->mat_ijacobian, user->Q_dot_loc, NULL));
-      PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_ijacobian, "ijacobian time shift", &user->phys->ijacobian_time_shift_label));
       PetscCallCeed(ceed, CeedOperatorDestroy(&op_ijacobian));
     }
-    if (app_ctx->sgs_model_type == SGS_MODEL_DATA_DRIVEN) PetscCall(SgsDDSetup(ceed, user, ceed_data, problem));
   }
 
   if (problem->use_strong_bc_ceed) PetscCall(SetupStrongBC_Ceed(ceed, ceed_data, dm, user, problem, bc));
   if (app_ctx->turb_spanstats_enable) PetscCall(TurbulenceStatisticsSetup(ceed, user, ceed_data, problem));
   if (app_ctx->diff_filter_monitor && !user->diff_filter) PetscCall(DifferentialFilterSetup(ceed, user, ceed_data, problem));
-  if (app_ctx->sgs_train_enable) PetscCall(SGS_DD_TrainingSetup(ceed, user, ceed_data, problem));
 
+  PetscCallCeed(ceed, CeedVectorDestroy(&jac_data));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_jd_i));
   PetscCallCeed(ceed, CeedOperatorDestroy(&op_ijacobian_vol));
-  PetscCallCeed(ceed, CeedVectorDestroy(&jac_data));
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_ifunction_vol));
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_rhs_vol));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/examples/fluids/src/setupts.c b/examples/fluids/src/setupts.c
index 2ef3de7203..6ebaa66b39 100644
--- a/examples/fluids/src/setupts.c
+++ b/examples/fluids/src/setupts.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -140,10 +140,6 @@ PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *u
   PetscCall(VecReadCeedToPetsc(user->q_dot_ceed, q_dot_mem_type, Q_dot_loc));
   PetscCall(VecCeedToPetsc(user->g_ceed, g_mem_type, G_loc));
 
-  if (user->app_ctx->sgs_model_type == SGS_MODEL_DATA_DRIVEN) {
-    PetscCall(SgsDDApplyIFunction(user, Q_loc, G_loc));
-  }
-
   // Local-to-Global
   PetscCall(VecZeroEntries(G));
   PetscCall(DMLocalToGlobal(user->dm, G_loc, ADD_VALUES, G));
@@ -155,7 +151,6 @@ PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *u
 
 PetscErrorCode FormIJacobian_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, PetscReal shift, Mat J, Mat J_pre, void *user_data) {
   User      user = *(User *)user_data;
-  Ceed      ceed = user->ceed;
   PetscBool J_is_matceed, J_is_mffd, J_pre_is_matceed, J_pre_is_mffd;
 
   PetscFunctionBeginUser;
@@ -163,12 +158,8 @@ PetscErrorCode FormIJacobian_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, PetscReal
   PetscCall(PetscObjectTypeCompare((PetscObject)J, MATCEED, &J_is_matceed));
   PetscCall(PetscObjectTypeCompare((PetscObject)J_pre, MATMFFD, &J_pre_is_mffd));
   PetscCall(PetscObjectTypeCompare((PetscObject)J_pre, MATCEED, &J_pre_is_matceed));
-  if (user->phys->ijacobian_time_shift_label) {
-    CeedOperator op_ijacobian;
 
-    PetscCall(MatCeedGetCeedOperators(user->mat_ijacobian, &op_ijacobian, NULL));
-    PetscCallCeed(ceed, CeedOperatorSetContextDouble(op_ijacobian, user->phys->ijacobian_time_shift_label, &shift));
-  }
+  PetscCall(MatCeedSetContextReal(user->mat_ijacobian, "ijacobian time shift", shift));
 
   if (J_is_matceed || J_is_mffd) {
     PetscCall(MatAssemblyBegin(J, MAT_FINAL_ASSEMBLY));
@@ -216,8 +207,8 @@ PetscErrorCode WriteOutput(User user, Vec Q, PetscInt step_no, PetscScalar time)
       PetscCall(VecZeroEntries(Q_refined_loc));
       PetscCall(DMGlobalToLocal(user->dm_viz, Q_refined, INSERT_VALUES, Q_refined_loc));
 
-      PetscCall(
-          PetscSNPrintf(file_path_refined, sizeof file_path_refined, "%s/nsrefined-%03" PetscInt_FMT ".vtu", user->app_ctx->output_dir, step_no));
+      PetscCall(PetscSNPrintf(file_path_refined, sizeof file_path_refined, "%s/nsrefined-%03" PetscInt_FMT ".vtu", user->app_ctx->output_dir,
+                              step_no));
 
       PetscCall(PetscViewerVTKOpen(PetscObjectComm((PetscObject)Q_refined), file_path_refined, FILE_MODE_WRITE, &viewer_refined));
       PetscCall(VecView(Q_refined_loc, viewer_refined));
@@ -303,7 +294,7 @@ PetscErrorCode TSMonitor_NS(TS ts, PetscInt step_no, PetscReal time, Vec Q, void
 }
 
 // TS: Create, setup, and solve
-PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Vec *Q, PetscScalar *f_time, TS *ts) {
+PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, ProblemData problem, Vec *Q, PetscScalar *f_time, TS *ts) {
   MPI_Comm    comm = user->comm;
   TSAdapt     adapt;
   PetscScalar final_time;
@@ -378,10 +369,7 @@ PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Vec *Q
   }
   if (app_ctx->diff_filter_monitor) PetscCall(TSMonitorSet(*ts, TSMonitor_DifferentialFilter, user, NULL));
 
-  if (app_ctx->sgs_train_enable) {
-    PetscCall(TSMonitorSet(*ts, TSMonitor_SGS_DD_Training, user, NULL));
-    PetscCall(TSSetPostStep(*ts, TSPostStep_SGS_DD_Training));
-  }
+  if (app_ctx->test_type == TESTTYPE_NONE) PetscCall(PrintRunInfo(user, user->phys, problem, *ts));
   // Solve
   PetscReal start_time;
   PetscInt  start_step;
diff --git a/examples/fluids/src/smartsim/sgs_dd_training.c b/examples/fluids/src/smartsim/sgs_dd_training.c
deleted file mode 100644
index c3ff2ac43b..0000000000
--- a/examples/fluids/src/smartsim/sgs_dd_training.c
+++ /dev/null
@@ -1,390 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../../qfunctions/sgs_dd_training.h"
-
-#include <petscdmplex.h>
-
-#include "../../include/smartsim.h"
-#include "../../navierstokes.h"
-
-typedef struct {
-  CeedElemRestriction  elem_restr_grid_aniso;
-  CeedVector           grid_aniso_ceed;
-  CeedQFunctionContext sgs_dd_train_qfctx;
-} *SGS_DD_TrainingSetupData;
-
-static PetscErrorCode SGS_DD_TrainingSetupDataDestroy(SGS_DD_TrainingSetupData sgs_dd_train_setup_data) {
-  Ceed ceed;
-
-  PetscFunctionBeginUser;
-  PetscCall(CeedElemRestrictionGetCeed(sgs_dd_train_setup_data->elem_restr_grid_aniso, &ceed));
-
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&sgs_dd_train_setup_data->elem_restr_grid_aniso));
-  PetscCallCeed(ceed, CeedVectorDestroy(&sgs_dd_train_setup_data->grid_aniso_ceed));
-  PetscCallCeed(ceed, CeedQFunctionContextDestroy(&sgs_dd_train_setup_data->sgs_dd_train_qfctx));
-  PetscCall(PetscFree(sgs_dd_train_setup_data));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-// @brief Create DM for storing data-drive SGS model inputs
-static PetscErrorCode SGS_DD_TrainingCreateDM(DM dm_source, DM *dm_dd_training, PetscInt degree, PetscInt q_extra, PetscInt *num_components) {
-  PetscSection section;
-
-  PetscFunctionBeginUser;
-  *num_components = 12;
-
-  PetscCall(DMClone(dm_source, dm_dd_training));
-  PetscCall(PetscObjectSetName((PetscObject)*dm_dd_training, "Data-Driven SGS Training Data"));
-
-  PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, degree, 1, q_extra, 1, num_components, *dm_dd_training));
-
-  PetscCall(DMGetLocalSection(*dm_dd_training, &section));
-  PetscCall(PetscSectionSetFieldName(section, 0, "Data-Driven SGS Training Data"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 0, "SGSInput1"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 1, "SGSInput2"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 2, "SGSInput3"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 3, "SGSInput4"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 4, "SGSInput5"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 5, "SGSInput6"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 6, "FilteredSGSXX"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 7, "FilteredSGSYY"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 8, "FilteredSGSZZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 9, "FilteredSGSYZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 10, "FilteredSGSXZ"));
-  PetscCall(PetscSectionSetComponentName(section, 0, 11, "FilteredSGSXY"));
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-// @brief Create CeedOperator to calculate training data for data-drive SGS model at nodes
-static PetscErrorCode SetupTrainingDataCalculation(Ceed ceed, User user, CeedData ceed_data, ProblemData problem,
-                                                   SGS_DD_TrainingSetupData sgs_dd_train_setup_data) {
-  SGS_DD_TrainingData sgs_dd_train = user->sgs_dd_train;
-  CeedQFunction       qf_sgs_dd_train;
-  CeedOperator        op_sgs_dd_train;
-  CeedInt             num_comp_grad_velo, num_comp_grid_aniso;
-  CeedVector          inv_multiplicity, filtered_fields;
-  CeedElemRestriction elem_restr_inv_multiplicity, elem_restr_grad_velo, elem_restr_sgs_train;
-  DMLabel             domain_label = NULL;
-  PetscInt            label_value = 0, height = 0, dm_field = 0;
-
-  PetscFunctionBeginUser;
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(sgs_dd_train_setup_data->elem_restr_grid_aniso, &num_comp_grid_aniso));
-
-  PetscCall(DMPlexCeedElemRestrictionCreate(ceed, sgs_dd_train->dm_dd_training, domain_label, label_value, height, dm_field, &elem_restr_sgs_train));
-  PetscCall(GetInverseMultiplicity(ceed, sgs_dd_train->dm_dd_training, domain_label, label_value, height, dm_field, PETSC_TRUE,
-                                   &elem_restr_inv_multiplicity, &inv_multiplicity));
-
-  CeedElemRestriction elem_restr_filtered_state;
-  CeedInt             num_comp_filtered_state;
-  {  // -- Setup filtered velocity gradient projection
-    CeedBasis         basis_filtered_state;
-    CeedOperatorField op_field;
-    PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->diff_filter->op_rhs_ctx->op, "v0", &op_field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_filtered_state));
-    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_filtered_state, &num_comp_filtered_state));
-    PetscCallCeed(ceed, CeedOperatorFieldGetBasis(op_field, &basis_filtered_state));
-    PetscCall(VelocityGradientProjectionSetup(ceed, user, ceed_data, problem, STATEVAR_PRIMITIVE, elem_restr_filtered_state, basis_filtered_state,
-                                              &sgs_dd_train->filtered_grad_velo_proj));
-    // Get velocity gradient information
-    PetscCallCeed(ceed, CeedOperatorGetFieldByName(sgs_dd_train->filtered_grad_velo_proj->l2_rhs_ctx->op, "velocity gradient", &op_field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_grad_velo));
-    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_grad_velo, &num_comp_grad_velo));
-  }
-
-  CeedElemRestriction elem_restr_filtered_velo_prod;
-  CeedInt             num_comp_filtered_velo_prod;
-  {  // Get filtered velocity product information
-    CeedOperatorField op_field;
-    PetscCallCeed(ceed, CeedOperatorGetFieldByName(user->diff_filter->op_rhs_ctx->op, "v1", &op_field));
-    PetscCallCeed(ceed, CeedOperatorFieldGetElemRestriction(op_field, &elem_restr_filtered_velo_prod));
-    PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(elem_restr_filtered_velo_prod, &num_comp_filtered_velo_prod));
-  }
-
-  // -- Create operator for generating training data at nodes
-  // Differential Filter only provides filtered primitive variables
-  PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ComputeSGS_DDAnisotropicTrainingDataNodal_Prim,
-                                                  ComputeSGS_DDAnisotropicTrainingDataNodal_Prim_loc, &qf_sgs_dd_train));
-
-  PetscCallCeed(ceed, CeedQFunctionSetContext(qf_sgs_dd_train, sgs_dd_train_setup_data->sgs_dd_train_qfctx));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "q", num_comp_filtered_state, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "velocity product", num_comp_filtered_velo_prod, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "gradient velocity", num_comp_grad_velo, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "anisotropy tensor", num_comp_grid_aniso, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddInput(qf_sgs_dd_train, "inverse multiplicity", 1, CEED_EVAL_NONE));
-  PetscCallCeed(ceed, CeedQFunctionAddOutput(qf_sgs_dd_train, "training data", sgs_dd_train->num_comp_dd_inputs, CEED_EVAL_NONE));
-
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector(elem_restr_filtered_state, &filtered_fields, NULL));
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_sgs_dd_train, NULL, NULL, &op_sgs_dd_train));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "q", elem_restr_filtered_state, CEED_BASIS_NONE, filtered_fields));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "velocity product", elem_restr_filtered_velo_prod, CEED_BASIS_NONE, filtered_fields));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "gradient velocity", elem_restr_grad_velo, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "anisotropy tensor", sgs_dd_train_setup_data->elem_restr_grid_aniso, CEED_BASIS_NONE,
-                                           sgs_dd_train_setup_data->grid_aniso_ceed));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "inverse multiplicity", elem_restr_inv_multiplicity, CEED_BASIS_NONE, inv_multiplicity));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_sgs_dd_train, "training data", elem_restr_sgs_train, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-
-  PetscCall(OperatorApplyContextCreate(sgs_dd_train->filtered_grad_velo_proj->dm, sgs_dd_train->dm_dd_training, ceed, op_sgs_dd_train, NULL, NULL,
-                                       NULL, NULL, &sgs_dd_train->op_training_data_calc_ctx));
-
-  PetscCallCeed(ceed, CeedVectorDestroy(&inv_multiplicity));
-  PetscCallCeed(ceed, CeedVectorDestroy(&filtered_fields));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_inv_multiplicity));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_sgs_dd_train));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_sgs_dd_train));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) {
-  SGS_DDTrainingContext    sgsdd_train_qfctx;
-  SGS_DD_TrainingSetupData sgs_dd_train_setup_data;
-
-  PetscFunctionBeginUser;
-  if (!user->diff_filter) PetscCall(DifferentialFilterSetup(ceed, user, ceed_data, problem));
-  if (!user->smartsim) PetscCall(SmartSimSetup(user));
-
-  PetscCall(PetscNew(&sgsdd_train_qfctx));
-  PetscCall(PetscNew(&sgs_dd_train_setup_data));
-  PetscCall(PetscNew(&user->sgs_dd_train));
-  SGS_DD_TrainingData sgs_dd_train = user->sgs_dd_train;
-
-  sgs_dd_train->overwrite_training_data = PETSC_TRUE;
-  sgs_dd_train->write_data_interval     = 1;
-  sgs_dd_train->num_filter_widths       = sizeof(sgs_dd_train->filter_widths) / sizeof(sgs_dd_train->filter_widths[0]);
-  PetscOptionsBegin(user->comm, NULL, "SGS Data-Driven Training Options", NULL);
-  PetscCall(PetscOptionsInt("-sgs_train_write_data_interval", "Number of timesteps between writing data into database", NULL,
-                            sgs_dd_train->write_data_interval, &sgs_dd_train->write_data_interval, NULL));
-  PetscCall(PetscOptionsBool("-sgs_train_overwrite_data", "Overwrite old training data in the database", NULL, sgs_dd_train->overwrite_training_data,
-                             &sgs_dd_train->overwrite_training_data, NULL));
-  PetscCall(PetscOptionsRealArray("-sgs_train_filter_width_scales", "Scales of each filter width put into training database", NULL,
-                                  sgs_dd_train->filter_widths, &sgs_dd_train->num_filter_widths, NULL));
-  PetscOptionsEnd();
-
-  // -- Create DM for storing training data
-  PetscCall(SGS_DD_TrainingCreateDM(user->dm, &sgs_dd_train->dm_dd_training, user->app_ctx->degree, user->app_ctx->q_extra,
-                                    &sgs_dd_train->num_comp_dd_inputs));
-
-  {  // -- Create QFunction Context
-    NewtonianIdealGasContext gas;
-    PetscCallCeed(ceed, CeedQFunctionContextGetDataRead(problem->apply_vol_ifunction.qfunction_context, CEED_MEM_HOST, &gas));
-    sgsdd_train_qfctx->gas = *gas;
-    PetscCallCeed(ceed, CeedQFunctionContextRestoreDataRead(problem->apply_vol_ifunction.qfunction_context, &gas));
-    PetscCallCeed(ceed, CeedQFunctionContextCreate(user->ceed, &sgs_dd_train_setup_data->sgs_dd_train_qfctx));
-    PetscCallCeed(ceed, CeedQFunctionContextSetData(sgs_dd_train_setup_data->sgs_dd_train_qfctx, CEED_MEM_HOST, CEED_USE_POINTER,
-                                                    sizeof(*sgsdd_train_qfctx), sgsdd_train_qfctx));
-    PetscCallCeed(ceed, CeedQFunctionContextSetDataDestroy(sgs_dd_train_setup_data->sgs_dd_train_qfctx, CEED_MEM_HOST, FreeContextPetsc));
-  }
-
-  {  // -- Send training data array info to SmartRedis database
-    PetscMPIInt  rank, num_ranks;
-    SmartSimData smartsim = user->smartsim;
-    PetscCallMPI(MPI_Comm_rank(user->comm, &rank));
-    PetscCallMPI(MPI_Comm_size(user->comm, &num_ranks));
-
-    {
-      PetscSection global_section;
-      PetscInt     num_dofs, num_comps, local_min_max[2] = {0.}, global_min_max[2] = {0.};
-
-      PetscCall(DMGetGlobalSection(sgs_dd_train->dm_dd_training, &global_section));
-      PetscCall(DMGetGlobalVectorInfo(sgs_dd_train->dm_dd_training, &num_dofs, NULL, NULL));
-      PetscCall(PetscSectionGetFieldComponents(global_section, 0, &num_comps));
-      local_min_max[0] = num_dofs;
-      PetscCall(PetscGlobalMinMaxInt(user->comm, local_min_max, global_min_max));
-
-      sgs_dd_train->training_data_array_dims[0] = global_min_max[0] / num_comps;
-      sgs_dd_train->training_data_array_dims[1] = num_comps;
-    }
-
-    if (rank % smartsim->collocated_database_num_ranks == 0) {
-      {  // Communicate info on simulation size
-        const char tensor_name[]  = "sizeInfo";
-        size_t     array_info_dim = 6;
-        PetscInt64 array_info[6] = {0}, num_features = 6;
-
-        array_info[0] = sgs_dd_train->training_data_array_dims[0];
-        array_info[1] = sgs_dd_train->training_data_array_dims[1];
-        array_info[2] = num_features;
-        array_info[3] = num_ranks;
-        array_info[4] = smartsim->collocated_database_num_ranks;
-        array_info[5] = rank;
-
-        PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-        PetscCallSmartRedis(
-            put_tensor(smartsim->client, tensor_name, strlen(tensor_name), array_info, &array_info_dim, 1, SRTensorTypeInt64, SRMemLayoutContiguous));
-        PetscCall(SmartRedisVerifyPutTensor(smartsim->client, tensor_name, strlen(tensor_name)));
-        PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-      }
-
-      {  // Send array that communicates if tensors are overwritten in database
-        const char tensor_name[]       = "tensor-ow";
-        PetscInt64 tensor_overwrite[2] = {sgs_dd_train->overwrite_training_data};
-        size_t     dim_2[1]            = {2};
-
-        PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-        PetscCallSmartRedis(
-            put_tensor(smartsim->client, tensor_name, strlen(tensor_name), tensor_overwrite, dim_2, 1, SRTensorTypeInt64, SRMemLayoutContiguous));
-        PetscCall(SmartRedisVerifyPutTensor(smartsim->client, tensor_name, strlen(tensor_name)));
-        PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-      }
-
-      {  // Communicate number of filter widths used
-        const char tensor_name[]     = "num_filter_widths";
-        PetscInt64 num_filter_widths = sgs_dd_train->num_filter_widths;
-        size_t     dim_2             = 1;
-
-        PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-        PetscCallSmartRedis(
-            put_tensor(smartsim->client, tensor_name, strlen(tensor_name), &num_filter_widths, &dim_2, 1, SRTensorTypeInt64, SRMemLayoutContiguous));
-        PetscCall(SmartRedisVerifyPutTensor(smartsim->client, tensor_name, strlen(tensor_name)));
-        PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-      }
-    }
-  }
-
-  // -- Compute and store anisotropy tensor
-  PetscCall(GridAnisotropyTensorProjectionSetupApply(ceed, user, ceed_data, &sgs_dd_train_setup_data->elem_restr_grid_aniso,
-                                                     &sgs_dd_train_setup_data->grid_aniso_ceed));
-
-  // -- Create Nodal Evaluation Operator
-  PetscCall(SetupTrainingDataCalculation(ceed, user, ceed_data, problem, sgs_dd_train_setup_data));
-
-  PetscCall(SGS_DD_TrainingSetupDataDestroy(sgs_dd_train_setup_data));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx) {
-  User                user         = (User)ctx;
-  Ceed                ceed         = user->ceed;
-  SGS_DD_TrainingData sgs_dd_train = user->sgs_dd_train;
-  SmartSimData        smartsim     = user->smartsim;
-  Vec                 TrainingData;
-  PetscMPIInt         rank;
-
-  PetscFunctionBeginUser;
-
-  PetscCallMPI(MPI_Comm_rank(user->comm, &rank));
-
-  if (step_num % sgs_dd_train->write_data_interval != 0) PetscFunctionReturn(PETSC_SUCCESS);
-  PetscCall(DMGetGlobalVector(sgs_dd_train->dm_dd_training, &TrainingData));
-
-  for (PetscInt filter_index = 0; filter_index < sgs_dd_train->num_filter_widths; filter_index++) {
-    PetscCall(PetscLogEventBegin(FLUIDS_TrainDataCompute, 0, 0, 0, 0));
-    {  // -- Compute and assemble training data
-      Vec          FilteredVelocityGradient, FilteredFields, FilteredFields_loc;
-      PetscMemType filtered_fields_mem_type;
-      CeedVector   filtered_fields;
-
-      {  // Set filter width for the current solve
-        double       filter_width_scaling[3];
-        CeedOperator op_mat;
-        Mat          A_mat;
-
-        for (int j = 0; j < 3; j++) filter_width_scaling[j] = sgs_dd_train->filter_widths[filter_index];
-        PetscCall(KSPGetOperators(user->diff_filter->ksp, &A_mat, NULL));
-        PetscCall(MatCeedGetCeedOperators(A_mat, &op_mat, NULL));
-        PetscCall(CeedOperatorSetContextDouble(op_mat, user->diff_filter->filter_width_scaling_label, filter_width_scaling));
-      }
-
-      PetscCall(DMGetGlobalVector(user->diff_filter->dm_filter, &FilteredFields));
-      PetscCall(DMGetLocalVector(user->diff_filter->dm_filter, &FilteredFields_loc));
-
-      PetscCall(DifferentialFilterApply(user, solution_time, Q, FilteredFields));
-      PetscCall(DMGlobalToLocal(user->diff_filter->dm_filter, FilteredFields, INSERT_VALUES, FilteredFields_loc));
-
-      PetscCall(DMGetGlobalVector(sgs_dd_train->filtered_grad_velo_proj->dm, &FilteredVelocityGradient));
-      PetscCall(VelocityGradientProjectionApply(sgs_dd_train->filtered_grad_velo_proj, FilteredFields_loc, FilteredVelocityGradient));
-
-      {
-        CeedOperatorField op_field;
-
-        PetscCallCeed(ceed, CeedOperatorGetFieldByName(sgs_dd_train->op_training_data_calc_ctx->op, "q", &op_field));
-        PetscCallCeed(ceed, CeedOperatorFieldGetVector(op_field, &filtered_fields));
-      }
-
-      PetscCall(VecPetscToCeed(FilteredFields_loc, &filtered_fields_mem_type, filtered_fields));  // filtered_fields is an implicit input
-      PetscCall(ApplyCeedOperatorGlobalToGlobal(FilteredVelocityGradient, TrainingData, sgs_dd_train->op_training_data_calc_ctx));
-      PetscCall(VecCeedToPetsc(filtered_fields, filtered_fields_mem_type, FilteredFields_loc));
-
-      PetscCall(DMRestoreGlobalVector(sgs_dd_train->filtered_grad_velo_proj->dm, &FilteredVelocityGradient));
-      PetscCall(DMRestoreGlobalVector(user->diff_filter->dm_filter, &FilteredFields));
-      PetscCall(DMRestoreLocalVector(user->diff_filter->dm_filter, &FilteredFields_loc));
-    }
-    PetscCall(PetscLogEventEnd(FLUIDS_TrainDataCompute, 0, 0, 0, 0));
-
-    {  // -- Send training data to SmartSim
-      char   array_key[PETSC_MAX_PATH_LEN];
-      size_t array_key_len;
-
-      if (sgs_dd_train->overwrite_training_data) {
-        PetscCall(PetscSNPrintf(array_key, sizeof array_key, "%s.%" PetscInt_FMT, smartsim->rank_id_name, filter_index));
-      } else {
-        PetscCall(PetscSNPrintf(array_key, sizeof array_key, "%s.%" PetscInt_FMT "%" PetscInt_FMT, smartsim->rank_id_name, step_num, filter_index));
-      }
-      PetscCall(PetscStrlen(array_key, &array_key_len));
-
-      {
-        const PetscScalar *training_data;
-        PetscCall(VecGetArrayRead(TrainingData, &training_data));
-        PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Train, 0, 0, 0, 0));
-        PetscCallSmartRedis(put_tensor(smartsim->client, array_key, array_key_len, (void *)training_data, sgs_dd_train->training_data_array_dims, 2,
-                                       SRTensorTypeDouble, SRMemLayoutContiguous));
-        PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Train, 0, 0, 0, 0));
-        PetscCall(VecRestoreArrayRead(TrainingData, &training_data));
-      }
-    }
-  }
-
-  if (rank % smartsim->collocated_database_num_ranks == 0) {
-    const char tensor_name[] = "step";
-    size_t     dim_2[1]      = {2};
-    PetscInt64 step_array[2] = {step_num, step_num};
-
-    PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-    PetscCallSmartRedis(
-        put_tensor(smartsim->client, tensor_name, strlen(tensor_name), step_array, dim_2, 1, SRTensorTypeInt64, SRMemLayoutContiguous));
-    PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  }
-
-  PetscCall(DMRestoreGlobalVector(user->sgs_dd_train->dm_dd_training, &TrainingData));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode TSPostStep_SGS_DD_Training(TS ts) {
-  User         user;
-  const char   check_run_key[]   = "check-run";
-  PetscReal    check_run[2]      = {1};
-  const size_t check_run_dims[1] = {2};
-  size_t       check_run_key_size;
-
-  PetscFunctionBeginUser;
-  PetscCall(PetscStrlen(check_run_key, &check_run_key_size));
-  PetscCall(TSGetApplicationContext(ts, &user));
-  SmartSimData smartsim = user->smartsim;
-
-  PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  PetscCallSmartRedis(
-      unpack_tensor(smartsim->client, check_run_key, check_run_key_size, check_run, check_run_dims, 1, SRTensorTypeDouble, SRMemLayoutContiguous));
-  PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  if (check_run[0] == 0) {
-    PetscCall(PetscPrintf(user->comm, "-- Simulation stopped by 'check-run' tensor in Redis database\n"));
-    PetscCall(TSSetConvergedReason(ts, TS_CONVERGED_USER));
-  }
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train) {
-  PetscFunctionBeginUser;
-  if (!sgs_dd_train) PetscFunctionReturn(PETSC_SUCCESS);
-
-  PetscCall(OperatorApplyContextDestroy(sgs_dd_train->op_training_data_calc_ctx));
-  PetscCall(NodalProjectionDataDestroy(sgs_dd_train->filtered_grad_velo_proj));
-  PetscCall(DMDestroy(&sgs_dd_train->dm_dd_training));
-  PetscCall(PetscFree(sgs_dd_train));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
diff --git a/examples/fluids/src/smartsim/smartsim.c b/examples/fluids/src/smartsim/smartsim.c
deleted file mode 100644
index 03ddab9606..0000000000
--- a/examples/fluids/src/smartsim/smartsim.c
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-// Based on the instructions from https://www.craylabs.org/docs/sr_integration.html and PHASTA implementation
-
-#include "../../include/smartsim.h"
-
-#include "../../navierstokes.h"
-
-PetscErrorCode SmartRedisVerifyPutTensor(void *c_client, const char *name, const size_t name_length) {
-  bool does_exist = true;
-
-  PetscFunctionBeginUser;
-  PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  PetscCallSmartRedis(tensor_exists(c_client, name, name_length, &does_exist));
-  PetscCheck(does_exist, PETSC_COMM_SELF, -1, "Tensor of name '%s' was not written to the database successfully", name);
-  PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SmartSimTrainingSetup(User user) {
-  SmartSimData smartsim = user->smartsim;
-  PetscMPIInt  rank;
-  PetscReal    checkrun[2] = {1};
-  size_t       dim_2[1]    = {2};
-
-  PetscFunctionBeginUser;
-  PetscCallMPI(MPI_Comm_rank(user->comm, &rank));
-
-  if (rank % smartsim->collocated_database_num_ranks == 0) {
-    // -- Send array that communicates when ML is done training
-    PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-    PetscCallSmartRedis(put_tensor(smartsim->client, "check-run", 9, checkrun, dim_2, 1, SRTensorTypeDouble, SRMemLayoutContiguous));
-    PetscCall(SmartRedisVerifyPutTensor(smartsim->client, "check-run", 9));
-    PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Meta, 0, 0, 0, 0));
-  }
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SmartSimSetup(User user) {
-  PetscMPIInt rank;
-  PetscInt    num_orchestrator_nodes = 1;
-
-  PetscFunctionBeginUser;
-  PetscCall(PetscNew(&user->smartsim));
-  SmartSimData smartsim = user->smartsim;
-
-  smartsim->collocated_database_num_ranks = 1;
-  PetscOptionsBegin(user->comm, NULL, "Options for SmartSim integration", NULL);
-  PetscCall(PetscOptionsInt("-smartsim_collocated_database_num_ranks", "Number of ranks per collocated database instance", NULL,
-                            smartsim->collocated_database_num_ranks, &smartsim->collocated_database_num_ranks, NULL));
-  PetscOptionsEnd();
-
-  // Create prefix to be put on tensor names
-  PetscCallMPI(MPI_Comm_rank(user->comm, &rank));
-  PetscCall(PetscSNPrintf(smartsim->rank_id_name, sizeof(smartsim->rank_id_name), "y.%d", rank));
-
-  PetscCall(PetscLogEventBegin(FLUIDS_SmartRedis_Init, 0, 0, 0, 0));
-  PetscCallSmartRedis(SmartRedisCClient(num_orchestrator_nodes != 1, smartsim->rank_id_name, strlen(smartsim->rank_id_name), &smartsim->client));
-  PetscCall(PetscLogEventEnd(FLUIDS_SmartRedis_Init, 0, 0, 0, 0));
-
-  PetscCall(SmartSimTrainingSetup(user));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim) {
-  PetscFunctionBeginUser;
-  if (!smartsim) PetscFunctionReturn(PETSC_SUCCESS);
-
-  PetscCallSmartRedis(DeleteCClient(&smartsim->client));
-  PetscCall(PetscFree(smartsim));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
diff --git a/examples/fluids/src/smartsim_weak.c b/examples/fluids/src/smartsim_weak.c
deleted file mode 100644
index 9c97419a8c..0000000000
--- a/examples/fluids/src/smartsim_weak.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-//
-// @file This creates weak functions for smartsim dependent functions. If the smartsim-dependent functions are actually built, these functions are not
-// linked to the final executable.
-
-#include "../navierstokes.h"
-
-PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) __attribute__((weak));
-PetscErrorCode SGS_DD_TrainingSetup(Ceed ceed, User user, CeedData ceed_data, ProblemData problem) {
-  PetscFunctionBeginUser;
-  SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with SMARTREDIS_DIR set to run %s", __func__);
-};
-
-PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx) __attribute__((weak));
-PetscErrorCode TSMonitor_SGS_DD_Training(TS ts, PetscInt step_num, PetscReal solution_time, Vec Q, void *ctx) {
-  PetscFunctionBeginUser;
-  SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with SMARTREDIS_DIR set to run %s", __func__);
-};
-
-PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train) __attribute__((weak));
-PetscErrorCode SGS_DD_TrainingDataDestroy(SGS_DD_TrainingData sgs_dd_train) {
-  PetscFunctionBeginUser;
-  if (!sgs_dd_train) PetscFunctionReturn(PETSC_SUCCESS);
-  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Warning: SGS_DD_TrainingData struct should not be initialized if SMARTREDIS_DIR isn't set on build..."));
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
-
-PetscErrorCode TSPostStep_SGS_DD_Training(TS ts) __attribute__((weak));
-PetscErrorCode TSPostStep_SGS_DD_Training(TS ts) {
-  PetscFunctionBeginUser;
-  SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Must build with SMARTREDIS_DIR set to run %s", __func__);
-};
-
-PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim) __attribute__((weak));
-PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim) {
-  PetscFunctionBeginUser;
-  if (!smartsim) PetscFunctionReturn(PETSC_SUCCESS);
-  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Warning: SmartSimData struct should not be initialized if SMARTREDIS_DIR isn't set on build..."));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-}
diff --git a/examples/fluids/src/strong_boundary_conditions.c b/examples/fluids/src/strong_boundary_conditions.c
index 9bcc753885..76bee17d39 100644
--- a/examples/fluids/src/strong_boundary_conditions.c
+++ b/examples/fluids/src/strong_boundary_conditions.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -26,8 +26,21 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
   PetscFunctionBeginUser;
   PetscCall(DMGetLabel(dm, "Face Sets", &domain_label));
 
-  // Basis
-  PetscCallCeed(ceed, CeedBasisCreateProjection(ceed_data->basis_x_sur, ceed_data->basis_q_sur, &basis_x_to_q_sur));
+  {  // Basis
+    CeedBasis basis_x_sur, basis_q_sur;
+    DM        dm_coord;
+
+    PetscCall(DMGetCoordinateDM(dm, &dm_coord));
+    DMLabel  label       = NULL;
+    PetscInt label_value = 0;
+    PetscCall(CreateBasisFromPlex(ceed, dm, label, label_value, height, dm_field, &basis_q_sur));
+    PetscCall(CreateBasisFromPlex(ceed, dm_coord, label, label_value, height, dm_field, &basis_x_sur));
+
+    PetscCallCeed(ceed, CeedBasisCreateProjection(basis_x_sur, basis_q_sur, &basis_x_to_q_sur));
+
+    PetscCallCeed(ceed, CeedBasisDestroy(&basis_q_sur));
+    PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_sur));
+  }
 
   // Setup QFunction
   PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, SetupStrongBC, SetupStrongBC_loc, &qf_setup));
@@ -40,6 +53,7 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
 
   // Setup STG Setup QFunction
   PetscCall(SetupStrongStg_PreProcessing(ceed, problem, num_comp_x, stg_data_size, dXdx_size, &qf_stgdata));
+  PetscCall(SetupStrongStg_QF(ceed, problem, num_comp_x, num_comp_q, stg_data_size, dXdx_size, &qf_strongbc));
 
   // Compute contribution on each boundary face
   for (CeedInt i = 0; i < bc->num_inflow; i++) {
@@ -79,8 +93,7 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
 
     PetscCallCeed(ceed, CeedOperatorApply(op_stgdata, CEED_VECTOR_NONE, stg_data, CEED_REQUEST_IMMEDIATE));
 
-    // -- Setup BC QFunctions
-    PetscCall(SetupStrongStg_QF(ceed, problem, num_comp_x, num_comp_q, stg_data_size, dXdx_size, &qf_strongbc));
+    // -- Setup BC Sub Operator
     PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_strongbc, NULL, NULL, &op_strong_bc_sub));
     PetscCallCeed(ceed, CeedOperatorSetName(op_strong_bc_sub, "Strong STG"));
 
@@ -91,7 +104,7 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
     PetscCallCeed(ceed, CeedOperatorSetField(op_strong_bc_sub, "q", elem_restr_q_sur, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
 
     // -- Add to composite operator
-    PetscCallCeed(ceed, CeedCompositeOperatorAddSub(op_strong_bc, op_strong_bc_sub));
+    PetscCallCeed(ceed, CeedOperatorCompositeAddSub(op_strong_bc, op_strong_bc_sub));
 
     PetscCallCeed(ceed, CeedVectorDestroy(&multiplicity));
     PetscCallCeed(ceed, CeedVectorDestroy(&x_stored));
@@ -104,8 +117,6 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
     PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_scale));
     PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_stgdata));
     PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_dXdx));
-    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_strongbc));
-    PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_stgdata));
     PetscCallCeed(ceed, CeedOperatorDestroy(&op_strong_bc_sub));
     PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup));
     PetscCallCeed(ceed, CeedOperatorDestroy(&op_stgdata));
@@ -114,6 +125,8 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, Problem
   PetscCallCeed(ceed, CeedOperatorGetContextFieldLabel(op_strong_bc, "solution time", &phys->stg_solution_time_label));
 
   PetscCallCeed(ceed, CeedBasisDestroy(&basis_x_to_q_sur));
+  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_strongbc));
+  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_stgdata));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_setup));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -155,7 +168,7 @@ PetscErrorCode SetupStrongBC_Ceed(Ceed ceed, CeedData ceed_data, DM dm, User use
     PetscCall(DMRestoreGlobalVector(dm, &global_vec));
   }
 
-  PetscCallCeed(ceed, CeedCompositeOperatorCreate(ceed, &op_strong_bc));
+  PetscCallCeed(ceed, CeedOperatorCreateComposite(ceed, &op_strong_bc));
   {
     PetscBool use_strongstg = PETSC_FALSE;
     PetscCall(PetscOptionsGetBool(NULL, NULL, "-stg_strong", &use_strongstg, NULL));
@@ -168,5 +181,6 @@ PetscErrorCode SetupStrongBC_Ceed(Ceed ceed, CeedData ceed_data, DM dm, User use
   PetscCall(OperatorApplyContextCreate(NULL, NULL, ceed, op_strong_bc, CEED_VECTOR_NONE, NULL, NULL, NULL, &user->op_strong_bc_ctx));
 
   PetscCall(PetscObjectComposeFunction((PetscObject)dm, "DMPlexInsertBoundaryValues_C", DMPlexInsertBoundaryValues_StrongBCCeed));
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_strong_bc));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
diff --git a/examples/fluids/src/turb_spanstats.c b/examples/fluids/src/turb_spanstats.c
index 54ab617afc..942efc38a7 100644
--- a/examples/fluids/src/turb_spanstats.c
+++ b/examples/fluids/src/turb_spanstats.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -21,9 +21,9 @@
 #include "../navierstokes.h"
 
 typedef struct {
-  CeedElemRestriction elem_restr_parent_x, elem_restr_parent_stats, elem_restr_parent_qd, elem_restr_parent_colloc, elem_restr_child_colloc;
+  CeedElemRestriction elem_restr_parent_x, elem_restr_parent_stats, elem_restr_parent_colloc, elem_restr_child_colloc;
   CeedBasis           basis_x, basis_stats;
-  CeedVector          x_coord, q_data;
+  CeedVector          x_coord;
 } *SpanStatsSetupData;
 
 PetscErrorCode CreateStatsDM(User user, ProblemData problem, PetscInt degree) {
@@ -40,7 +40,7 @@ PetscErrorCode CreateStatsDM(User user, ProblemData problem, PetscInt degree) {
 
   // Get spanwise length
   PetscCall(DMGetBoundingBox(user->dm, domain_min, domain_max));
-  user->spanstats.span_width = domain_max[2] - domain_min[1];
+  user->spanstats.span_width = domain_max[2] - domain_min[2];
 
   {  // Get DM from surface
     DM             parent_distributed_dm;
@@ -65,6 +65,7 @@ PetscErrorCode CreateStatsDM(User user, ProblemData problem, PetscInt degree) {
       for (PetscInt i = 0; i < nleaves; i++) {
         PetscCall(DMLabelSetValue(label, ilocal[i], 1));
       }
+      PetscCall(PetscSFDestroy(&inv_isoperiodicface));
     } else {
       PetscCall(DMGetLabel(user->dm, "Face Sets", &label));
     }
@@ -169,7 +170,7 @@ PetscErrorCode GetQuadratureCoords(Ceed ceed, DM dm, CeedElemRestriction elem_re
   PetscCallCeed(ceed, CeedOperatorSetField(op_quad_coords, "input", elem_restr_x, basis_x, x_coords));
   PetscCallCeed(ceed, CeedOperatorSetField(op_quad_coords, "output", elem_restr_qx, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
 
-  PetscCall(CeedOperatorCreateLocalVecs(op_quad_coords, DMReturnVecType(dm), PetscObjectComm((PetscObject)dm), NULL, Qx_coords));
+  PetscCall(CeedOperatorCreateLocalVecs(op_quad_coords, DMReturnVecType(dm), PETSC_COMM_SELF, NULL, Qx_coords));
   PetscCall(OperatorApplyContextCreate(NULL, NULL, ceed, op_quad_coords, CEED_VECTOR_NONE, NULL, NULL, NULL, &op_quad_coords_ctx));
 
   PetscCall(ApplyCeedOperatorLocalToLocal(NULL, *Qx_coords, op_quad_coords_ctx));
@@ -183,7 +184,6 @@ PetscErrorCode GetQuadratureCoords(Ceed ceed, DM dm, CeedElemRestriction elem_re
 
 PetscErrorCode SpanStatsSetupDataCreate(Ceed ceed, User user, CeedData ceed_data, ProblemData problem, SpanStatsSetupData *stats_data) {
   DM       dm = user->spanstats.dm;
-  PetscInt dim;
   CeedInt  num_comp_x, num_comp_stats = user->spanstats.num_comp_stats;
   Vec      X_loc;
   DMLabel  domain_label = NULL;
@@ -192,14 +192,10 @@ PetscErrorCode SpanStatsSetupDataCreate(Ceed ceed, User user, CeedData ceed_data
   PetscFunctionBeginUser;
   PetscCall(PetscNew(stats_data));
 
-  PetscCall(DMGetDimension(dm, &dim));
   PetscCall(DMPlexCeedElemRestrictionCreate(ceed, dm, domain_label, label_value, height, dm_field, &(*stats_data)->elem_restr_parent_stats));
   PetscCall(DMPlexCeedElemRestrictionCoordinateCreate(ceed, dm, domain_label, label_value, height, &(*stats_data)->elem_restr_parent_x));
-  PetscCall(DMPlexCeedElemRestrictionQDataCreate(ceed, dm, domain_label, label_value, height, problem->q_data_size_sur,
-                                                 &(*stats_data)->elem_restr_parent_qd));
   PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents((*stats_data)->elem_restr_parent_x, &num_comp_x));
   PetscCallCeed(ceed, CeedElemRestrictionCreateVector((*stats_data)->elem_restr_parent_x, &(*stats_data)->x_coord, NULL));
-  PetscCallCeed(ceed, CeedElemRestrictionCreateVector((*stats_data)->elem_restr_parent_qd, &(*stats_data)->q_data, NULL));
 
   {
     DM dm_coord;
@@ -210,8 +206,8 @@ PetscErrorCode SpanStatsSetupDataCreate(Ceed ceed, User user, CeedData ceed_data
 
   PetscCall(CreateElemRestrColloc_CompMajor(ceed, num_comp_stats, (*stats_data)->basis_stats, (*stats_data)->elem_restr_parent_stats,
                                             &(*stats_data)->elem_restr_parent_colloc));
-  PetscCall(
-      CreateElemRestrColloc_CompMajor(ceed, num_comp_stats, ceed_data->basis_q, ceed_data->elem_restr_q, &(*stats_data)->elem_restr_child_colloc));
+  PetscCall(CreateElemRestrColloc_CompMajor(ceed, num_comp_stats, ceed_data->basis_q, ceed_data->elem_restr_q,
+                                            &(*stats_data)->elem_restr_child_colloc));
 
   {  // -- Copy DM coordinates into CeedVector
     DM cdm;
@@ -234,7 +230,6 @@ PetscErrorCode SpanStatsSetupDataDestroy(SpanStatsSetupData data) {
   PetscCall(CeedElemRestrictionGetCeed(data->elem_restr_parent_x, &ceed));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_x));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_stats));
-  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_qd));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_parent_colloc));
   PetscCallCeed(ceed, CeedElemRestrictionDestroy(&data->elem_restr_child_colloc));
 
@@ -242,7 +237,8 @@ PetscErrorCode SpanStatsSetupDataDestroy(SpanStatsSetupData data) {
   PetscCallCeed(ceed, CeedBasisDestroy(&data->basis_stats));
 
   PetscCallCeed(ceed, CeedVectorDestroy(&data->x_coord));
-  PetscCallCeed(ceed, CeedVectorDestroy(&data->q_data));
+
+  PetscCheck(CeedDestroy(&ceed) == CEED_ERROR_SUCCESS, PETSC_COMM_WORLD, PETSC_ERR_LIB, "Destroying Ceed object failed");
 
   PetscCall(PetscFree(data));
   PetscFunctionReturn(PETSC_SUCCESS);
@@ -298,10 +294,13 @@ PetscErrorCode CreateStatsSF(Ceed ceed, CeedData ceed_data, SpanStatsSetupData s
 
 // @brief Setup RHS and LHS for L^2 projection of statistics
 PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data, SpanStatsSetupData stats_data) {
-  CeedOperator  op_mass, op_setup_sur, op_proj_rhs;
-  CeedQFunction qf_mass, qf_stats_proj;
-  CeedInt       q_data_size, num_comp_stats = user->spanstats.num_comp_stats;
-  MPI_Comm      comm = PetscObjectComm((PetscObject)user->spanstats.dm);
+  CeedOperator        op_mass, op_proj_rhs;
+  CeedQFunction       qf_mass, qf_stats_proj;
+  CeedInt             q_data_size, num_comp_stats = user->spanstats.num_comp_stats;
+  CeedElemRestriction elem_restr_qd;
+  CeedVector          q_data;
+  DMLabel             domain_label = NULL;
+  PetscInt            label_value  = 0;
 
   PetscFunctionBeginUser;
   // -- Create Operator for RHS of L^2 projection of statistics
@@ -314,33 +313,24 @@ PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data,
   PetscCallCeed(ceed, CeedOperatorSetField(op_proj_rhs, "output", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE));
 
   PetscCall(OperatorApplyContextCreate(NULL, user->spanstats.dm, ceed, op_proj_rhs, NULL, NULL, NULL, NULL, &user->spanstats.op_proj_rhs_ctx));
-  PetscCall(CeedOperatorCreateLocalVecs(op_proj_rhs, DMReturnVecType(user->spanstats.dm), comm, &user->spanstats.Parent_Stats_loc, NULL));
-
-  // -- Setup LHS of L^2 projection
-  // Get q_data for mass matrix operator
-  PetscCallCeed(ceed, CeedOperatorCreate(ceed, ceed_data->qf_setup_sur, NULL, NULL, &op_setup_sur));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "dx", stats_data->elem_restr_parent_x, stats_data->basis_x, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, stats_data->basis_x, CEED_VECTOR_NONE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_setup_sur, "surface qdata", stats_data->elem_restr_parent_qd, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorApply(op_setup_sur, stats_data->x_coord, stats_data->q_data, CEED_REQUEST_IMMEDIATE));
-
-  // CEED Restriction
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(stats_data->elem_restr_parent_qd, &q_data_size));
+  PetscCall(CeedOperatorCreateLocalVecs(op_proj_rhs, DMReturnVecType(user->spanstats.dm), PETSC_COMM_SELF, &user->spanstats.Parent_Stats_loc, NULL));
+  PetscCall(QDataGet(ceed, user->spanstats.dm, domain_label, label_value, stats_data->elem_restr_parent_x, stats_data->basis_x, stats_data->x_coord,
+                     &elem_restr_qd, &q_data, &q_data_size));
 
   // Create Mass CeedOperator
   PetscCall(CreateMassQFunction(ceed, num_comp_stats, q_data_size, &qf_mass));
   PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_mass, NULL, NULL, &op_mass));
   PetscCallCeed(ceed, CeedOperatorSetField(op_mass, "u", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_mass, "qdata", stats_data->elem_restr_parent_qd, CEED_BASIS_NONE, stats_data->q_data));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_mass, "qdata", elem_restr_qd, CEED_BASIS_NONE, q_data));
   PetscCallCeed(ceed, CeedOperatorSetField(op_mass, "v", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE));
 
   {  // Setup KSP for L^2 projection
     Mat mat_mass;
     KSP ksp;
 
-    PetscCall(MatCeedCreate(user->spanstats.dm, user->spanstats.dm, op_mass, NULL, &mat_mass));
+    PetscCall(MatCreateCeed(user->spanstats.dm, user->spanstats.dm, op_mass, NULL, &mat_mass));
 
-    PetscCall(KSPCreate(comm, &ksp));
+    PetscCall(KSPCreate(PetscObjectComm((PetscObject)user->spanstats.dm), &ksp));
     PetscCall(KSPSetOptionsPrefix(ksp, "turbulence_spanstats_"));
     {
       PC pc;
@@ -357,10 +347,11 @@ PetscErrorCode SetupL2ProjectionStats(Ceed ceed, User user, CeedData ceed_data,
   }
 
   // Cleanup
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_qd));
+  PetscCallCeed(ceed, CeedVectorDestroy(&q_data));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_mass));
   PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_stats_proj));
   PetscCallCeed(ceed, CeedOperatorDestroy(&op_mass));
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_setup_sur));
   PetscCallCeed(ceed, CeedOperatorDestroy(&op_proj_rhs));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
@@ -385,8 +376,9 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData
     case STATEVAR_CONSERVATIVE:
       PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ChildStatsCollection_Conserv, ChildStatsCollection_Conserv_loc, &qf_stats_collect));
       break;
-    default:
-      SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "No statisics collection available for chosen state variable");
+    case STATEVAR_ENTROPY:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ChildStatsCollection_Entropy, ChildStatsCollection_Entropy_loc, &qf_stats_collect));
+      break;
   }
 
   if (user->spanstats.do_mms_test) {
@@ -405,9 +397,9 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData
 
     PetscCallCeed(ceed, CeedQFunctionContextRegisterDouble(collect_context, "solution time",
                                                            offsetof(struct Turbulence_SpanStatsContext_, solution_time), 1, "Current solution time"));
-    PetscCallCeed(
-        ceed, CeedQFunctionContextRegisterDouble(collect_context, "previous time", offsetof(struct Turbulence_SpanStatsContext_, previous_time), 1,
-                                                 "Previous time statistics collection was done"));
+    PetscCallCeed(ceed,
+                  CeedQFunctionContextRegisterDouble(collect_context, "previous time", offsetof(struct Turbulence_SpanStatsContext_, previous_time),
+                                                     1, "Previous time statistics collection was done"));
 
     PetscCallCeed(ceed, CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &newtonian_ig_ctx));
   }
@@ -431,7 +423,7 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData
   PetscCall(OperatorApplyContextCreate(user->dm, user->spanstats.dm, user->ceed, op_stats_collect, user->q_ceed, NULL, NULL, NULL,
                                        &user->spanstats.op_stats_collect_ctx));
 
-  PetscCall(CeedOperatorCreateLocalVecs(op_stats_collect, DMReturnVecType(user->spanstats.dm), PetscObjectComm((PetscObject)user->spanstats.dm), NULL,
+  PetscCall(CeedOperatorCreateLocalVecs(op_stats_collect, DMReturnVecType(user->spanstats.dm), PETSC_COMM_SELF, NULL,
                                         &user->spanstats.Child_Stats_loc));
   PetscCall(VecZeroEntries(user->spanstats.Child_Stats_loc));
 
@@ -442,13 +434,18 @@ PetscErrorCode CreateStatisticCollectionOperator(Ceed ceed, User user, CeedData
 
 // Creates operator for calculating error of method of manufactured solution (MMS) test
 PetscErrorCode SetupMMSErrorChecking(Ceed ceed, User user, CeedData ceed_data, SpanStatsSetupData stats_data) {
-  CeedInt       num_comp_stats = user->spanstats.num_comp_stats, num_comp_x, q_data_size;
-  CeedQFunction qf_error;
-  CeedOperator  op_error;
-  CeedVector    x_ceed, y_ceed;
+  CeedInt             num_comp_stats = user->spanstats.num_comp_stats, num_comp_x, q_data_size;
+  CeedQFunction       qf_error;
+  CeedOperator        op_error;
+  CeedVector          x_ceed, y_ceed;
+  DMLabel             domain_label = NULL;
+  PetscInt            label_value  = 0;
+  CeedVector          q_data;
+  CeedElemRestriction elem_restr_parent_qd;
 
   PetscFunctionBeginUser;
-  PetscCallCeed(ceed, CeedElemRestrictionGetNumComponents(stats_data->elem_restr_parent_qd, &q_data_size));
+  PetscCall(QDataGet(ceed, user->spanstats.dm, domain_label, label_value, stats_data->elem_restr_parent_x, stats_data->basis_x, stats_data->x_coord,
+                     &elem_restr_parent_qd, &q_data, &q_data_size));
   PetscCallCeed(ceed, CeedBasisGetNumComponents(stats_data->basis_x, &num_comp_x));
 
   PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, ChildStatsCollectionMMSTest_Error, ChildStatsCollectionMMSTest_Error_loc, &qf_error));
@@ -459,7 +456,7 @@ PetscErrorCode SetupMMSErrorChecking(Ceed ceed, User user, CeedData ceed_data, S
 
   PetscCallCeed(ceed, CeedOperatorCreate(ceed, qf_error, NULL, NULL, &op_error));
   PetscCallCeed(ceed, CeedOperatorSetField(op_error, "q", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE));
-  PetscCallCeed(ceed, CeedOperatorSetField(op_error, "qdata", stats_data->elem_restr_parent_qd, CEED_BASIS_NONE, stats_data->q_data));
+  PetscCallCeed(ceed, CeedOperatorSetField(op_error, "qdata", elem_restr_parent_qd, CEED_BASIS_NONE, q_data));
   PetscCallCeed(ceed, CeedOperatorSetField(op_error, "x", stats_data->elem_restr_parent_x, stats_data->basis_x, stats_data->x_coord));
   PetscCallCeed(ceed, CeedOperatorSetField(op_error, "v", stats_data->elem_restr_parent_stats, stats_data->basis_stats, CEED_VECTOR_ACTIVE));
 
@@ -468,10 +465,12 @@ PetscErrorCode SetupMMSErrorChecking(Ceed ceed, User user, CeedData ceed_data, S
   PetscCall(OperatorApplyContextCreate(user->spanstats.dm, user->spanstats.dm, user->ceed, op_error, x_ceed, y_ceed, NULL, NULL,
                                        &user->spanstats.mms_error_ctx));
 
-  PetscCallCeed(ceed, CeedOperatorDestroy(&op_error));
-  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_error));
+  PetscCallCeed(ceed, CeedVectorDestroy(&q_data));
   PetscCallCeed(ceed, CeedVectorDestroy(&x_ceed));
   PetscCallCeed(ceed, CeedVectorDestroy(&y_ceed));
+  PetscCallCeed(ceed, CeedElemRestrictionDestroy(&elem_restr_parent_qd));
+  PetscCallCeed(ceed, CeedQFunctionDestroy(&qf_error));
+  PetscCallCeed(ceed, CeedOperatorDestroy(&op_error));
   PetscFunctionReturn(PETSC_SUCCESS);
 }
 
diff --git a/examples/fluids/src/velocity_gradient_projection.c b/examples/fluids/src/velocity_gradient_projection.c
index 7b1f970d72..277da68ee1 100644
--- a/examples/fluids/src/velocity_gradient_projection.c
+++ b/examples/fluids/src/velocity_gradient_projection.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -22,8 +22,8 @@ PetscErrorCode VelocityGradientProjectionCreateDM(NodalProjectionData grad_velo_
   PetscCall(DMClone(user->dm, &grad_velo_proj->dm));
   PetscCall(PetscObjectSetName((PetscObject)grad_velo_proj->dm, "Velocity Gradient Projection"));
 
-  PetscCall(
-      DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, user->app_ctx->degree, 1, user->app_ctx->q_extra, 1, &grad_velo_proj->num_comp, grad_velo_proj->dm));
+  PetscCall(DMSetupByOrder_FEM(PETSC_TRUE, PETSC_TRUE, user->app_ctx->degree, 1, user->app_ctx->q_extra, 1, &grad_velo_proj->num_comp,
+                               grad_velo_proj->dm));
 
   PetscCall(DMGetLocalSection(grad_velo_proj->dm, &section));
   PetscCall(PetscSectionSetFieldName(section, 0, ""));
@@ -67,15 +67,17 @@ PetscErrorCode VelocityGradientProjectionSetup(Ceed ceed, User user, CeedData ce
   // -- Build RHS operator
   switch (state_var_input) {
     case STATEVAR_PRIMITIVE:
-      PetscCallCeed(
-          ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Prim, VelocityGradientProjectionRHS_Prim_loc, &qf_rhs_assemble));
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Prim, VelocityGradientProjectionRHS_Prim_loc,
+                                                      &qf_rhs_assemble));
       break;
     case STATEVAR_CONSERVATIVE:
       PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Conserv, VelocityGradientProjectionRHS_Conserv_loc,
                                                       &qf_rhs_assemble));
       break;
-    default:
-      SETERRQ(PetscObjectComm((PetscObject)user->dm), PETSC_ERR_SUP, "No velocity gradient projection QFunction for chosen state variable");
+    case STATEVAR_ENTROPY:
+      PetscCallCeed(ceed, CeedQFunctionCreateInterior(ceed, 1, VelocityGradientProjectionRHS_Entropy, VelocityGradientProjectionRHS_Entropy_loc,
+                                                      &qf_rhs_assemble));
+      break;
   }
 
   PetscCallCeed(ceed, CeedQFunctionSetContext(qf_rhs_assemble, problem->apply_vol_ifunction.qfunction_context));
@@ -105,7 +107,7 @@ PetscErrorCode VelocityGradientProjectionSetup(Ceed ceed, User user, CeedData ce
     Mat      mat_mass;
     MPI_Comm comm = PetscObjectComm((PetscObject)grad_velo_proj->dm);
 
-    PetscCall(MatCeedCreate(grad_velo_proj->dm, grad_velo_proj->dm, op_mass, NULL, &mat_mass));
+    PetscCall(MatCreateCeed(grad_velo_proj->dm, grad_velo_proj->dm, op_mass, NULL, &mat_mass));
 
     PetscCall(KSPCreate(comm, &grad_velo_proj->ksp));
     PetscCall(KSPSetOptionsPrefix(grad_velo_proj->ksp, "velocity_gradient_projection_"));
diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin
new file mode 100644
index 0000000000..af70688040
Binary files /dev/null and b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL-entropy.bin differ
diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin
index facbebe2d6..9ae8647455 100644
Binary files a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin and b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-IDL.bin differ
diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin
index bd8ea4f163..27826f39c2 100644
Binary files a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin and b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-explicit.bin differ
diff --git a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin
index ab4052b1dc..5b9252ae28 100644
Binary files a/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin and b/examples/fluids/tests-output/fluids-navierstokes-gaussianwave-shell.bin differ
diff --git a/examples/fluids/vortexshedding.yaml b/examples/fluids/vortexshedding.yaml
index 9541a94c7e..9b910da328 100644
--- a/examples/fluids/vortexshedding.yaml
+++ b/examples/fluids/vortexshedding.yaml
@@ -33,7 +33,7 @@ outflow:
 freestream:
   velocity: 1,0,0
 # Small gravity vector to break symmetry so shedding can start
-g: 0,-.01,0
+gravity: 0,-.01,0
 
 # viscosity corresponds to Reynolds number 100
 mu: 0.01
@@ -44,11 +44,11 @@ degree: 3
 dm_plex_filename: examples/fluids/meshes/cylinder-q1-n08.msh
 
 # Boundary Settings
-bc_slip_z: 6
+bc_symmetry_z: 6
 bc_wall: 5
 bc_freestream: 1
 bc_outflow: 2
-bc_slip_y: 3,4
+bc_symmetry_y: 3,4
 wall_comps: 1,2,3
 
 # Primitive variables are preferred at low Mach number
diff --git a/examples/mfem/Makefile b/examples/mfem/Makefile
index cb5abeba01..6b042926fe 100644
--- a/examples/mfem/Makefile
+++ b/examples/mfem/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/README.md b/examples/mfem/README.md
new file mode 100644
index 0000000000..d6d2002177
--- /dev/null
+++ b/examples/mfem/README.md
@@ -0,0 +1,18 @@
+## libCEED MFEM Examples
+
+These examples show to write libCEED operators (BP1 and BP3) within the open-source finite element library [MFEM](https://www.mfem.org/).
+
+First compile MFEM and libCEED individually. After that, compile the MFEM example:
+
+```bash
+export MFEM_DIR=/path/to/mfem
+make
+```
+
+To run the executable, write:
+
+```
+./bp[1, 3]
+```
+
+Optional command-line arguments are shown by adding the command-line argument "--help".
diff --git a/examples/mfem/bp1.cpp b/examples/mfem/bp1.cpp
index f6a96dd536..096a6aeee7 100644
--- a/examples/mfem/bp1.cpp
+++ b/examples/mfem/bp1.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp1.h b/examples/mfem/bp1.h
index 332340340f..3e6fe273c8 100644
--- a/examples/mfem/bp1.h
+++ b/examples/mfem/bp1.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 /// A structure used to pass additional data to f_build_mass
 struct BuildContext {
diff --git a/examples/mfem/bp1.hpp b/examples/mfem/bp1.hpp
index cb43675b56..912346857c 100644
--- a/examples/mfem/bp1.hpp
+++ b/examples/mfem/bp1.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/mfem/bp3.cpp b/examples/mfem/bp3.cpp
index d4b8eb24e9..779a75f3a2 100644
--- a/examples/mfem/bp3.cpp
+++ b/examples/mfem/bp3.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -188,6 +188,7 @@ int main(int argc, char *argv[]) {
   delete fespace;
   delete fec;
   delete mesh;
+  delete D;
   CeedDestroy(&ceed);
   return 0;
 }
diff --git a/examples/mfem/bp3.h b/examples/mfem/bp3.h
index dde37b7446..bc73b3acab 100644
--- a/examples/mfem/bp3.h
+++ b/examples/mfem/bp3.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 /// A structure used to pass additional data to f_build_diff and f_apply_diff
 struct BuildContext {
diff --git a/examples/mfem/bp3.hpp b/examples/mfem/bp3.hpp
index d9b74474d0..36b88b3697 100644
--- a/examples/mfem/bp3.hpp
+++ b/examples/mfem/bp3.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/nek/README.md b/examples/nek/README.md
index 6c1cfdee44..1b8faec64d 100644
--- a/examples/nek/README.md
+++ b/examples/nek/README.md
@@ -2,7 +2,7 @@
 
 ### Prerequisites
 
-Nek5000 v18.0 or greater must be [installed](https://nek5000.mcs.anl.gov/getstarted/) to run these examples.
+Nek5000 v19.0 or greater must be [installed](https://nek5000.mcs.anl.gov/getstarted/) to run these examples.
 It is assumed to exist at `../../../Nek5000` (a sibling to the libCEED directory) or at a path defined in the environment variable `NEK5K_DIR`.
 For example, you could set
 ```sh
diff --git a/examples/nek/bps/bps.h b/examples/nek/bps/bps.h
index 446377b044..5de48e9e34 100644
--- a/examples/nek/bps/bps.h
+++ b/examples/nek/bps/bps.h
@@ -1,15 +1,15 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
+#pragma once
 
-#ifndef bps_h
-#define bps_h
-
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef M_PI
 #define M_PI 3.14159265358979323846
@@ -110,5 +110,3 @@ CEED_QFUNCTION(diffusionf)(void *ctx, CeedInt Q, const CeedScalar *const *in, Ce
   }  // End of Quadrature Point Loop
   return 0;
 }
-
-#endif  // bps_h
diff --git a/examples/nek/bps/bps.usr b/examples/nek/bps/bps.usr
index 89f7fca164..f5021d78d9 100644
--- a/examples/nek/bps/bps.usr
+++ b/examples/nek/bps/bps.usr
@@ -1,4 +1,4 @@
-C Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+C Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 C All Rights Reserved. See the top-level COPYRIGHT and NOTICE files for details.
 C
 C SPDX-License-Identifier: (BSD-2-Clause)
diff --git a/examples/petsc/Makefile b/examples/petsc/Makefile
index b465d25cfe..d66fd3176a 100644
--- a/examples/petsc/Makefile
+++ b/examples/petsc/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
@@ -15,7 +15,7 @@ CEED_DIR ?= ../..
 ceed.pc := $(CEED_DIR)/lib/pkgconfig/ceed.pc
 
 CC = $(call pkgconf, --variable=ccompiler $(PETSc.pc) $(ceed.pc))
-CFLAGS = -std=c99 \
+CFLAGS = -std=c11 \
   $(call pkgconf, --variable=cflags_extra $(PETSc.pc)) \
   $(call pkgconf, --cflags-only-other $(PETSc.pc)) \
   $(OPT)
diff --git a/examples/petsc/README.md b/examples/petsc/README.md
index 4ec9e4baff..b63e7d0e98 100644
--- a/examples/petsc/README.md
+++ b/examples/petsc/README.md
@@ -1,6 +1,6 @@
 ## libCEED + PETSc Examples
 
-PETSc v3.17 or a development version of PETSc at commit 0e95d842 or later is required.
+This page provides a description of the CEED bakeoff problem examples for the libCEED library, based on PETSc.
 
 ### CEED bakeoff problems with raw mesh management - bpsraw
 
@@ -17,7 +17,6 @@ In addition to the common arguments, the following arguments may be set:
 ### CEED bakeoff problems with DMPlex - bps
 
 This code solves the CEED bakeoff problems on a unstructured grid using DMPlex.
-This example requires a PETSc version later than 3.11.3.
 
 To build, run `make bps`
 
@@ -43,7 +42,6 @@ The resulting log file can be read by the Python plotting scripts in `benchmarks
 ### CEED bakeoff problems with DMPlex and PCMG - multigrid
 
 This code solves the CEED bakeoff problems on a unstructured grid using DMPlex with p-multigrid implemented in PCMG.
-This example requires a PETSc version later than 3.11.3.
 
 To build, run `make multigrid`
 
diff --git a/examples/petsc/area.c b/examples/petsc/area.c
index c72de7d6fa..1b146a4c21 100644
--- a/examples/petsc/area.c
+++ b/examples/petsc/area.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -71,7 +71,7 @@ int main(int argc, char **argv) {
   Ceed                 ceed;
   CeedData             ceed_data;
   ProblemType          problem_choice;
-  VecType              vec_type;
+  VecType              vec_type = VECSTANDARD;
   PetscMemType         mem_type;
 
   PetscCall(PetscInitialize(&argc, &argv, NULL, help));
@@ -110,15 +110,6 @@ int main(int argc, char **argv) {
   // Create DM
   PetscCall(SetupDMByDegree(dm, degree, q_extra, num_comp_u, topo_dim, false));
 
-  // Create vectors
-  PetscCall(DMCreateGlobalVector(dm, &U));
-  PetscCall(VecGetLocalSize(U, &l_size));
-  PetscCall(VecGetSize(U, &g_size));
-  PetscCall(DMCreateLocalVector(dm, &U_loc));
-  PetscCall(VecGetSize(U_loc, &xl_size));
-  PetscCall(VecDuplicate(U, &V));
-  PetscCall(VecDuplicate(U_loc, &V_loc));
-
   // Setup op_apply_ctx structure
   PetscCall(PetscMalloc1(1, &op_apply_ctx));
 
@@ -127,23 +118,30 @@ int main(int argc, char **argv) {
   CeedMemType mem_type_backend;
   CeedGetPreferredMemType(ceed, &mem_type_backend);
 
-  PetscCall(DMGetVecType(dm, &vec_type));
-  if (!vec_type) {  // Not yet set by op_apply_ctx -dm_vec_type
-    switch (mem_type_backend) {
-      case CEED_MEM_HOST:
-        vec_type = VECSTANDARD;
-        break;
-      case CEED_MEM_DEVICE: {
-        const char *resolved;
-        CeedGetResource(ceed, &resolved);
-        if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-        else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
-        else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
-        else vec_type = VECSTANDARD;
-      }
+  // Set mesh vec_type
+  switch (mem_type_backend) {
+    case CEED_MEM_HOST:
+      vec_type = VECSTANDARD;
+      break;
+    case CEED_MEM_DEVICE: {
+      const char *resolved;
+
+      CeedGetResource(ceed, &resolved);
+      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
+      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
+      else vec_type = VECSTANDARD;
     }
-    PetscCall(DMSetVecType(dm, vec_type));
   }
+  PetscCall(DMSetVecType(dm, vec_type));
+
+  // Create vectors
+  PetscCall(DMCreateGlobalVector(dm, &U));
+  PetscCall(VecGetLocalSize(U, &l_size));
+  PetscCall(VecGetSize(U, &g_size));
+  PetscCall(DMCreateLocalVector(dm, &U_loc));
+  PetscCall(VecGetSize(U_loc, &xl_size));
+  PetscCall(VecDuplicate(U, &V));
+  PetscCall(VecDuplicate(U_loc, &V_loc));
 
   // Print summary
   if (!test_mode) {
@@ -168,7 +166,7 @@ int main(int argc, char **argv) {
   // Setup libCEED's objects and apply setup operator
   PetscCall(PetscMalloc1(1, &ceed_data));
   PetscCall(SetupLibceedByDegree(dm, ceed, degree, topo_dim, q_extra, num_comp_x, num_comp_u, g_size, xl_size, problem_options[problem_choice],
-                                 ceed_data, false, (CeedVector)NULL, (CeedVector *)NULL));
+                                 ceed_data, false, true, (CeedVector)NULL, (CeedVector *)NULL));
 
   // Setup output vector
   PetscCall(VecZeroEntries(V_loc));
diff --git a/examples/petsc/area.h b/examples/petsc/area.h
index 1b95f6d6df..fd36dd79df 100644
--- a/examples/petsc/area.h
+++ b/examples/petsc/area.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c
index 29101e1379..a00ee650c8 100644
--- a/examples/petsc/bps.c
+++ b/examples/petsc/bps.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -25,7 +25,9 @@
 //     ./bps -problem bp6 -degree 3 -ceed /gpu/cuda
 //
 //TESTARGS(name="BP3, tet elements") -ceed {ceed_resource} -test -problem bp3 -degree 3 -ksp_max_it_clip 50,50 -simplex
-//TESTARGS(name="BP5, hex elements") -ceed {ceed_resource} -test -problem bp5 -degree 3 -ksp_max_it_clip 15,15
+//TESTARGS(name="BP5, hex elements") -ceed {ceed_resource} -test -problem bp5 -degree 3 -ksp_max_it_clip 18,18
+//TESTARGS(name="BP1+3, hex elements") -ceed {ceed_resource} -test -problem bp1_3 -degree 3 -ksp_max_it_clip 18,18
+//TESTARGS(name="BP2+4, hex elements") -ceed {ceed_resource} -test -problem bp2_4 -degree 3 -ksp_max_it_clip 18,18
 
 /// @file
 /// CEED BPs example using PETSc with DMPlex
@@ -62,7 +64,7 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
   CeedQFunction        qf_error;
   CeedOperator         op_error;
   CeedVector           rhs_ceed, target;
-  VecType              vec_type;
+  VecType              vec_type = VECSTANDARD;
   PetscMemType         mem_type;
 
   PetscFunctionBeginUser;
@@ -71,23 +73,22 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
   CeedMemType mem_type_backend;
   CeedGetPreferredMemType(ceed, &mem_type_backend);
 
-  PetscCall(DMGetVecType(dm, &vec_type));
-  if (!vec_type) {  // Not yet set by user -dm_vec_type
-    switch (mem_type_backend) {
-      case CEED_MEM_HOST:
-        vec_type = VECSTANDARD;
-        break;
-      case CEED_MEM_DEVICE: {
-        const char *resolved;
-        CeedGetResource(ceed, &resolved);
-        if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-        else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
-        else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
-        else vec_type = VECSTANDARD;
-      }
+  // Set mesh vec_type
+  switch (mem_type_backend) {
+    case CEED_MEM_HOST:
+      vec_type = VECSTANDARD;
+      break;
+    case CEED_MEM_DEVICE: {
+      const char *resolved;
+
+      CeedGetResource(ceed, &resolved);
+      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
+      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
+      else vec_type = VECSTANDARD;
     }
-    PetscCall(DMSetVecType(dm, vec_type));
   }
+  PetscCall(DMSetVecType(dm, vec_type));
+  PetscCall(DMSetFromOptions(dm));
 
   // Create global and local solution vectors
   PetscCall(DMCreateGlobalVector(dm, &X));
@@ -112,6 +113,15 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
     const char *used_resource;
     CeedGetResource(ceed, &used_resource);
 
+    bool is_combined_bp = rp->bp_choice > CEED_BP6;
+    char bp_name[6]     = "";
+
+    if (is_combined_bp) {
+      PetscCall(PetscSNPrintf(bp_name, 6, "%d + %d", rp->bp_choice % 2 ? 2 : 1, rp->bp_choice - CEED_BP4));
+    } else {
+      PetscCall(PetscSNPrintf(bp_name, 6, "%d", rp->bp_choice + 1));
+    }
+
     VecType vec_type;
     PetscCall(VecGetType(X, &vec_type));
 
@@ -123,7 +133,7 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
     PetscMPIInt      comm_size;
     PetscCall(MPI_Comm_size(rp->comm, &comm_size));
     PetscCall(PetscPrintf(rp->comm,
-                          "\n-- CEED Benchmark Problem %" CeedInt_FMT " -- libCEED + PETSc --\n"
+                          "\n-- CEED Benchmark Problem %s -- libCEED + PETSc --\n"
                           "  MPI:\n"
                           "    Hostname                                : %s\n"
                           "    Total ranks                             : %d\n"
@@ -142,8 +152,8 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
                           "    Element topology                        : %s\n"
                           "    Owned nodes                             : %" PetscInt_FMT "\n"
                           "    DoF per node                            : %" PetscInt_FMT "\n",
-                          rp->bp_choice + 1, rp->hostname, comm_size, rp->ranks_per_node, vec_type, used_resource, CeedMemTypes[mem_type_backend], P,
-                          Q, rp->q_extra, g_size / rp->num_comp_u, c_end - c_start, CeedElemTopologies[elem_topo], l_size / rp->num_comp_u,
+                          bp_name, rp->hostname, comm_size, rp->ranks_per_node, vec_type, used_resource, CeedMemTypes[mem_type_backend], P, Q,
+                          rp->q_extra, g_size / rp->num_comp_u, c_end - c_start, CeedElemTopologies[elem_topo], l_size / rp->num_comp_u,
                           rp->num_comp_u));
   }
 
@@ -155,7 +165,7 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
 
   PetscCall(PetscMalloc1(1, &ceed_data));
   PetscCall(SetupLibceedByDegree(dm, ceed, rp->degree, rp->dim, rp->q_extra, rp->dim, rp->num_comp_u, g_size, xl_size, bp_options[rp->bp_choice],
-                                 ceed_data, true, rhs_ceed, &target));
+                                 ceed_data, true, true, rhs_ceed, &target));
 
   // Gather RHS
   PetscCall(VecC2P(rhs_ceed, mem_type, rhs_loc));
@@ -183,9 +193,10 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
   {
     PC pc;
     PetscCall(KSPGetPC(ksp, &pc));
-    if (rp->bp_choice == CEED_BP1 || rp->bp_choice == CEED_BP2) {
+    if (rp->bp_choice == CEED_BP1 || rp->bp_choice == CEED_BP2 || rp->bp_choice == CEED_BP13 || rp->bp_choice == CEED_BP24 ||
+        rp->bp_choice == CEED_BP15 || rp->bp_choice == CEED_BP26) {
       PetscCall(PCSetType(pc, PCJACOBI));
-      if (rp->simplex) {
+      if (rp->simplex || rp->bp_choice == CEED_BP13 || rp->bp_choice == CEED_BP24 || rp->bp_choice == CEED_BP15 || rp->bp_choice == CEED_BP26) {
         PetscCall(PCJacobiSetType(pc, PC_JACOBI_DIAGONAL));
       } else {
         PetscCall(PCJacobiSetType(pc, PC_JACOBI_ROWSUM));
@@ -255,7 +266,11 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource)
       PetscCall(SetupErrorOperatorCtx(rp->comm, dm, ceed, ceed_data, X_loc, op_error, op_error_ctx));
       PetscScalar l2_error;
       PetscCall(ComputeL2Error(X, &l2_error, op_error_ctx));
-      PetscReal tol = 5e-2;
+      // Tighter tol for BP1, BP2
+      // Looser tol for BP3, BP4, BP5, and BP6 with extra for vector valued problems
+      // BP1+3 and BP2+4 follow the pattern for BP3 and BP4
+      // BP1+5 and BP2+6 follow the pattern for BP5 and BP6
+      PetscReal tol = rp->bp_choice < CEED_BP3 ? 5e-4 : (5e-2 + (rp->bp_choice % 2 == 1 ? 5e-3 : 0));
       if (!rp->test_mode || l2_error > tol) {
         PetscCall(MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, rp->comm));
         PetscCall(MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, rp->comm));
diff --git a/examples/petsc/bps.h b/examples/petsc/bps.h
index 9100c8af47..95d4a4c644 100644
--- a/examples/petsc/bps.h
+++ b/examples/petsc/bps.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -17,4 +17,4 @@ static const char *const mem_types[] = {"host", "device", "memType", "CEED_MEM_"
 typedef enum { COARSEN_UNIFORM = 0, COARSEN_LOGARITHMIC = 1 } CoarsenType;
 static const char *const coarsen_types[] = {"uniform", "logarithmic", "CoarsenType", "COARSEN", 0};
 
-static const char *const bp_types[] = {"bp1", "bp2", "bp3", "bp4", "bp5", "bp6", "BPType", "CEED_BP", 0};
+static const char *const bp_types[] = {"bp1", "bp2", "bp3", "bp4", "bp5", "bp6", "bp1_3", "bp2_4", "bp1_5", "bp2_6", "BPType", "CEED_BP", 0};
diff --git a/examples/petsc/bpsraw.c b/examples/petsc/bpsraw.c
index 5e2cdac76f..5bb10f4bd7 100644
--- a/examples/petsc/bpsraw.c
+++ b/examples/petsc/bpsraw.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -385,8 +385,8 @@ int main(int argc, char **argv) {
   PetscInt two       = 2;
   ksp_max_it_clip[0] = 5;
   ksp_max_it_clip[1] = 20;
-  PetscCall(
-      PetscOptionsIntArray("-ksp_max_it_clip", "Min and max number of iterations to use during benchmarking", NULL, ksp_max_it_clip, &two, NULL));
+  PetscCall(PetscOptionsIntArray("-ksp_max_it_clip", "Min and max number of iterations to use during benchmarking", NULL, ksp_max_it_clip, &two,
+                                 NULL));
   PetscOptionsEnd();
   P = degree + 1;
   Q = P + q_extra;
@@ -403,9 +403,9 @@ int main(int argc, char **argv) {
       break;
     case CEED_MEM_DEVICE: {
       const char *resolved;
+
       CeedGetResource(ceed, &resolved);
       if (strstr(resolved, "/gpu/cuda")) default_vec_type = VECCUDA;
-      else if (strstr(resolved, "/gpu/hip/occa")) default_vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
       else if (strstr(resolved, "/gpu/hip")) default_vec_type = VECHIP;
       else default_vec_type = VECSTANDARD;
     }
@@ -438,9 +438,6 @@ int main(int argc, char **argv) {
   PetscCall(VecSetFromOptions(X));
   PetscCall(VecSetUp(X));
 
-  // Set up libCEED
-  CeedInit(ceed_resource, &ceed);
-
   // Print summary
   PetscInt gsize;
 
@@ -509,8 +506,9 @@ int main(int argc, char **argv) {
           l_to_g_ind[here] = g_start[ir][jr][kr] + (ii * g_m_nodes[ir][jr][kr][1] + jj) * g_m_nodes[ir][jr][kr][2] + kk;
           if ((i_rank[0] == 0 && i == 0) || (i_rank[1] == 0 && j == 0) || (i_rank[2] == 0 && k == 0) ||
               (i_rank[0] + 1 == p[0] && i + 1 == l_nodes[0]) || (i_rank[1] + 1 == p[1] && j + 1 == l_nodes[1]) ||
-              (i_rank[2] + 1 == p[2] && k + 1 == l_nodes[2]))
+              (i_rank[2] + 1 == p[2] && k + 1 == l_nodes[2])) {
             continue;
+          }
           l_to_g_ind_0[l_0_count] = l_to_g_ind[here];
           loc_ind[l_0_count++]    = here;
         }
@@ -768,8 +766,8 @@ int main(int argc, char **argv) {
       }
     }
     if (!test_mode) {
-      PetscCall(
-          PetscPrintf(comm, "    DoFs/Sec in CG                     : %g (%g) million\n", 1e-6 * gsize * its / rt_max, 1e-6 * gsize * its / rt_min));
+      PetscCall(PetscPrintf(comm, "    DoFs/Sec in CG                     : %g (%g) million\n", 1e-6 * gsize * its / rt_max,
+                            1e-6 * gsize * its / rt_min));
     }
   }
 
@@ -798,21 +796,22 @@ int main(int argc, char **argv) {
   CeedVectorDestroy(&op_apply_ctx->y_ceed);
   CeedVectorDestroy(&op_apply_ctx->q_data);
   CeedVectorDestroy(&target);
-  CeedOperatorDestroy(&op_setup_geo);
-  CeedOperatorDestroy(&op_setup_rhs);
-  CeedOperatorDestroy(&op_apply);
-  CeedOperatorDestroy(&op_error);
   CeedElemRestrictionDestroy(&elem_restr_u);
   CeedElemRestrictionDestroy(&elem_restr_x);
   CeedElemRestrictionDestroy(&elem_restr_u_i);
   CeedElemRestrictionDestroy(&elem_restr_qd_i);
+  CeedBasisDestroy(&basis_u);
+  CeedBasisDestroy(&basis_x);
   CeedQFunctionDestroy(&qf_setup_geo);
   CeedQFunctionDestroy(&qf_setup_rhs);
   CeedQFunctionDestroy(&qf_apply);
   CeedQFunctionDestroy(&qf_error);
-  CeedBasisDestroy(&basis_u);
-  CeedBasisDestroy(&basis_x);
+  CeedOperatorDestroy(&op_setup_geo);
+  CeedOperatorDestroy(&op_setup_rhs);
+  CeedOperatorDestroy(&op_apply);
+  CeedOperatorDestroy(&op_error);
   CeedDestroy(&ceed);
+
   PetscCall(PetscFree(op_apply_ctx));
   return PetscFinalize();
 }
diff --git a/examples/petsc/bpssphere.c b/examples/petsc/bpssphere.c
index d928a815c1..30489224b1 100644
--- a/examples/petsc/bpssphere.c
+++ b/examples/petsc/bpssphere.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -64,7 +64,7 @@ int main(int argc, char **argv) {
   CeedOperator         op_error;
   CeedVector           rhs_ceed, target;
   BPType               bp_choice;
-  VecType              vec_type;
+  VecType              vec_type = VECSTANDARD;
   PetscMemType         mem_type;
 
   PetscCall(PetscInitialize(&argc, &argv, NULL, help));
@@ -92,6 +92,26 @@ int main(int argc, char **argv) {
   PetscCall(PetscOptionsBool("-simplex", "Use simplices, or tensor product cells", NULL, simplex, &simplex, NULL));
   PetscOptionsEnd();
 
+  // Set up libCEED
+  CeedInit(ceed_resource, &ceed);
+  CeedMemType mem_type_backend;
+  CeedGetPreferredMemType(ceed, &mem_type_backend);
+
+  // Set mesh vec_type
+  switch (mem_type_backend) {
+    case CEED_MEM_HOST:
+      vec_type = VECSTANDARD;
+      break;
+    case CEED_MEM_DEVICE: {
+      const char *resolved;
+
+      CeedGetResource(ceed, &resolved);
+      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
+      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
+      else vec_type = VECSTANDARD;
+    }
+  }
+
   // Setup DM
   if (read_mesh) {
     PetscCall(DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, &dm));
@@ -104,6 +124,7 @@ int main(int argc, char **argv) {
     // Refine DMPlex with uniform refinement using runtime option -dm_refine
     PetscCall(DMPlexSetRefinementUniform(dm, PETSC_TRUE));
   }
+  PetscCall(DMSetVecType(dm, vec_type));
   PetscCall(DMSetFromOptions(dm));
   // View DMPlex via runtime option
   PetscCall(DMViewFromOptions(dm, NULL, "-dm_view"));
@@ -125,29 +146,6 @@ int main(int argc, char **argv) {
   PetscCall(MatCreateShell(comm, l_size, l_size, g_size, g_size, op_apply_ctx, &mat_O));
   PetscCall(MatShellSetOperation(mat_O, MATOP_MULT, (void (*)(void))MatMult_Ceed));
 
-  // Set up libCEED
-  CeedInit(ceed_resource, &ceed);
-  CeedMemType mem_type_backend;
-  CeedGetPreferredMemType(ceed, &mem_type_backend);
-
-  PetscCall(DMGetVecType(dm, &vec_type));
-  if (!vec_type) {  // Not yet set by user -dm_vec_type
-    switch (mem_type_backend) {
-      case CEED_MEM_HOST:
-        vec_type = VECSTANDARD;
-        break;
-      case CEED_MEM_DEVICE: {
-        const char *resolved;
-        CeedGetResource(ceed, &resolved);
-        if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-        else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
-        else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
-        else vec_type = VECSTANDARD;
-      }
-    }
-    PetscCall(DMSetVecType(dm, vec_type));
-  }
-
   // Print summary
   if (!test_mode) {
     PetscInt    P = degree + 1, Q = P + q_extra;
@@ -175,7 +173,7 @@ int main(int argc, char **argv) {
   // Setup libCEED's objects
   PetscCall(PetscMalloc1(1, &ceed_data));
   PetscCall(SetupLibceedByDegree(dm, ceed, degree, topo_dim, q_extra, num_comp_x, num_comp_u, g_size, xl_size, bp_options[bp_choice], ceed_data, true,
-                                 rhs_ceed, &target));
+                                 true, rhs_ceed, &target));
 
   // Gather RHS
   PetscCall(VecC2P(rhs_ceed, mem_type, rhs_loc));
diff --git a/examples/petsc/bpssphere.h b/examples/petsc/bpssphere.h
index c3c7678f54..c5d030bab8 100644
--- a/examples/petsc/bpssphere.h
+++ b/examples/petsc/bpssphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/bpsswarm.c b/examples/petsc/bpsswarm.c
index 5796cf7471..e4ba5aed4c 100644
--- a/examples/petsc/bpsswarm.c
+++ b/examples/petsc/bpsswarm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -65,7 +65,7 @@ int main(int argc, char **argv) {
   CeedData             ceed_data;
   CeedOperator         op_error;
   BPType               bp_choice;
-  VecType              vec_type;
+  VecType              vec_type         = VECSTANDARD;
   PointSwarmType       point_swarm_type = SWARM_GAUSS;
   PetscMPIInt          ranks_per_node;
   char                 hostname[PETSC_MAX_PATH_LEN];
@@ -146,6 +146,26 @@ int main(int argc, char **argv) {
   }
   PetscOptionsEnd();
 
+  // Set up libCEED
+  CeedInit(ceed_resource, &ceed);
+  CeedMemType mem_type_backend;
+  CeedGetPreferredMemType(ceed, &mem_type_backend);
+
+  // Set background mesh vec_type
+  switch (mem_type_backend) {
+    case CEED_MEM_HOST:
+      vec_type = VECSTANDARD;
+      break;
+    case CEED_MEM_DEVICE: {
+      const char *resolved;
+
+      CeedGetResource(ceed, &resolved);
+      if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
+      else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
+      else vec_type = VECSTANDARD;
+    }
+  }
+
   // Setup DM
   if (read_mesh) {
     PetscCall(DMPlexCreateFromFile(comm, filename, NULL, PETSC_TRUE, &dm_mesh));
@@ -162,11 +182,13 @@ int main(int argc, char **argv) {
       PetscCheck(!is_simplex, comm, PETSC_ERR_USER, "Only tensor-product background meshes supported");
     }
   }
+  PetscCall(DMSetVecType(dm_mesh, vec_type));
+  PetscCall(DMSetFromOptions(dm_mesh));
+
   PetscCall(DMGetDimension(dm_mesh, &dim));
   PetscCall(SetupDMByDegree(dm_mesh, degree, q_extra, num_comp_u, dim, bp_options[bp_choice].enforce_bc));
 
   // View mesh
-  PetscCall(DMSetOptionsPrefix(dm_mesh, "final_"));
   PetscCall(DMViewFromOptions(dm_mesh, NULL, "-dm_view"));
 
   // Create particle swarm
@@ -209,29 +231,7 @@ int main(int argc, char **argv) {
   PetscCall(MatCreateShell(comm, l_size, l_size, g_size, g_size, op_apply_ctx, &mat_O));
   PetscCall(MatSetDM(mat_O, dm_mesh));
   PetscCall(MatShellSetOperation(mat_O, MATOP_MULT, (void (*)(void))MatMult_Ceed));
-
-  // Set up libCEED
-  CeedInit(ceed_resource, &ceed);
-  CeedMemType mem_type_backend;
-  CeedGetPreferredMemType(ceed, &mem_type_backend);
-
-  PetscCall(DMGetVecType(dm_mesh, &vec_type));
-  if (!vec_type) {  // Not yet set by user -dm_vec_type
-    switch (mem_type_backend) {
-      case CEED_MEM_HOST:
-        vec_type = VECSTANDARD;
-        break;
-      case CEED_MEM_DEVICE: {
-        const char *resolved;
-        CeedGetResource(ceed, &resolved);
-        if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-        else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
-        else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
-        else vec_type = VECSTANDARD;
-      }
-    }
-    PetscCall(DMSetVecType(dm_mesh, vec_type));
-  }
+  PetscCall(MatShellSetOperation(mat_O, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiag));
 
   // Print summary
   if (!test_mode) {
@@ -302,14 +302,9 @@ int main(int argc, char **argv) {
     PetscCall(KSPGetPC(ksp, &pc));
     if (bp_choice == CEED_BP1 || bp_choice == CEED_BP2) {
       PetscCall(PCSetType(pc, PCJACOBI));
-      PetscCall(PCJacobiSetType(pc, PC_JACOBI_ROWSUM));
+      PetscCall(PCJacobiSetType(pc, PC_JACOBI_DIAGONAL));
     } else {
       PetscCall(PCSetType(pc, PCNONE));
-      MatNullSpace nullspace;
-
-      PetscCall(MatNullSpaceCreate(PETSC_COMM_WORLD, PETSC_TRUE, 0, 0, &nullspace));
-      PetscCall(MatSetNullSpace(mat_O, nullspace));
-      PetscCall(MatNullSpaceDestroy(&nullspace));
     }
     PetscCall(KSPSetType(ksp, KSPCG));
     PetscCall(KSPSetNormType(ksp, KSP_NORM_NATURAL));
diff --git a/examples/petsc/dmswarm.c b/examples/petsc/dmswarm.c
index 557ace7ec2..fa95f16979 100644
--- a/examples/petsc/dmswarm.c
+++ b/examples/petsc/dmswarm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -18,7 +18,7 @@
 //
 //  ./dmswarm -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -num_comp 2 -swarm gauss
 //
-//TESTARGS(name="Uniform swarm, CG projection") -ceed {ceed_resource} -test -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -dm_plex_hash_location true -num_comp 2 -swarm uniform -solution_order 3 -points_per_cell 125
+//TESTARGS(name="Uniform swarm, CG projection") -ceed {ceed_resource} -test -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -dm_plex_hash_location true -num_comp 2 -swarm uniform -solution_order 3 -q_extra 0 -points_per_cell 125
 //TESTARGS(name="Gauss swarm, lumped projection") -ceed {ceed_resource} -test -dm_plex_dim 3 -dm_plex_box_faces 3,3,3 -dm_plex_box_lower -1.0,-1.0,-1.0 -dm_plex_simplex 0 -dm_plex_hash_location true -num_comp 2 -swarm gauss -ksp_type preonly -pc_type jacobi -pc_jacobi_type rowsum -tolerance 9e-2
 
 /// @file
@@ -82,10 +82,10 @@ int main(int argc, char **argv) {
   PetscOptionsBegin(comm, NULL, "libCEED example using PETSc with DMSwarm", NULL);
 
   PetscCall(PetscOptionsBool("-test", "Testing mode (do not print unless error is large)", NULL, test_mode, &test_mode, NULL));
-  PetscCall(
-      PetscOptionsBool("-u_petsc_swarm_view", "View XDMF of swarm values interpolated by PETSc", NULL, view_petsc_swarm, &view_petsc_swarm, NULL));
-  PetscCall(
-      PetscOptionsBool("-u_ceed_swarm_view", "View XDMF of swarm values interpolated by libCEED", NULL, view_ceed_swarm, &view_ceed_swarm, NULL));
+  PetscCall(PetscOptionsBool("-u_petsc_swarm_view", "View XDMF of swarm values interpolated by PETSc", NULL, view_petsc_swarm, &view_petsc_swarm,
+                             NULL));
+  PetscCall(PetscOptionsBool("-u_ceed_swarm_view", "View XDMF of swarm values interpolated by libCEED", NULL, view_ceed_swarm, &view_ceed_swarm,
+                             NULL));
   PetscCall(PetscOptionsEnum("-target", "Target field function", NULL, target_types, (PetscEnum)target_type, (PetscEnum *)&target_type, NULL));
   PetscCall(PetscOptionsInt("-solution_order", "Order of mesh solution space", NULL, solution_order, &solution_order, NULL));
   PetscCall(PetscOptionsInt("-mesh_order", "Order of mesh coordinate space", NULL, mesh_order, &mesh_order, NULL));
@@ -398,7 +398,7 @@ PetscErrorCode DMSwarmInterpolateFromCellToSwarm_Petsc(DM dm_swarm, const char *
     PetscCall(DMRestoreWorkArray(dm_mesh, num_points_in_cell * dim, MPIU_REAL, &coords_points_cell_true));
     PetscCall(DMRestoreWorkArray(dm_mesh, num_points_in_cell * dim, MPIU_REAL, &coords_points_cell_ref));
     PetscCall(PetscTabulationDestroy(&tabulation));
-    PetscCall(PetscFree(points_cell));
+    PetscCall(DMSwarmSortRestorePointsPerCell(dm_swarm, cell, &num_points_in_cell, &points_cell));
   }
 
   // Cleanup
@@ -486,7 +486,7 @@ PetscErrorCode DMSwarmCheckSwarmValues(DM dm_swarm, const char *field, PetscScal
     }
 
     // -- Cleanup
-    PetscCall(PetscFree(points));
+    PetscCall(DMSwarmSortRestorePointsPerCell(dm_swarm, cell, &num_points_in_cell, &points));
   }
 
   // Cleanup
diff --git a/examples/petsc/include/areaproblemdata.h b/examples/petsc/include/areaproblemdata.h
index cb5a254085..5820409159 100644
--- a/examples/petsc/include/areaproblemdata.h
+++ b/examples/petsc/include/areaproblemdata.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/bpsproblemdata.h b/examples/petsc/include/bpsproblemdata.h
index f89aadc318..9525216d0f 100644
--- a/examples/petsc/include/bpsproblemdata.h
+++ b/examples/petsc/include/bpsproblemdata.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -14,7 +14,9 @@
 
 #include "../include/structs.h"
 #include "../qfunctions/bps/bp1.h"
+#include "../qfunctions/bps/bp13.h"
 #include "../qfunctions/bps/bp2.h"
+#include "../qfunctions/bps/bp24.h"
 #include "../qfunctions/bps/bp3.h"
 #include "../qfunctions/bps/bp4.h"
 #include "../qfunctions/bps/common.h"
@@ -23,107 +25,175 @@
 // BP Option Data
 // -----------------------------------------------------------------------------
 
-BPData bp_options[6] = {
-    [CEED_BP1] = {.num_comp_u    = 1,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 1,
-                  .q_extra       = 1,
-                  .setup_geo     = SetupMassGeo,
-                  .setup_rhs     = SetupMassRhs,
-                  .apply         = Mass,
-                  .error         = Error,
-                  .setup_geo_loc = SetupMassGeo_loc,
-                  .setup_rhs_loc = SetupMassRhs_loc,
-                  .apply_loc     = Mass_loc,
-                  .error_loc     = Error_loc,
-                  .in_mode       = CEED_EVAL_INTERP,
-                  .out_mode      = CEED_EVAL_INTERP,
-                  .q_mode        = CEED_GAUSS,
-                  .enforce_bc    = PETSC_FALSE},
-    [CEED_BP2] = {.num_comp_u    = 3,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 1,
-                  .q_extra       = 1,
-                  .setup_geo     = SetupMassGeo,
-                  .setup_rhs     = SetupMassRhs3,
-                  .apply         = Mass3,
-                  .error         = Error3,
-                  .setup_geo_loc = SetupMassGeo_loc,
-                  .setup_rhs_loc = SetupMassRhs3_loc,
-                  .apply_loc     = Mass3_loc,
-                  .error_loc     = Error3_loc,
-                  .in_mode       = CEED_EVAL_INTERP,
-                  .out_mode      = CEED_EVAL_INTERP,
-                  .q_mode        = CEED_GAUSS,
-                  .enforce_bc    = PETSC_FALSE},
-    [CEED_BP3] = {.num_comp_u    = 1,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 7,
-                  .q_extra       = 1,
-                  .setup_geo     = SetupDiffGeo,
-                  .setup_rhs     = SetupDiffRhs,
-                  .apply         = Diff,
-                  .error         = Error,
-                  .setup_geo_loc = SetupDiffGeo_loc,
-                  .setup_rhs_loc = SetupDiffRhs_loc,
-                  .apply_loc     = Diff_loc,
-                  .error_loc     = Error_loc,
-                  .in_mode       = CEED_EVAL_GRAD,
-                  .out_mode      = CEED_EVAL_GRAD,
-                  .q_mode        = CEED_GAUSS,
-                  .enforce_bc    = PETSC_TRUE },
-    [CEED_BP4] = {.num_comp_u    = 3,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 7,
-                  .q_extra       = 1,
-                  .setup_geo     = SetupDiffGeo,
-                  .setup_rhs     = SetupDiffRhs3,
-                  .apply         = Diff3,
-                  .error         = Error3,
-                  .setup_geo_loc = SetupDiffGeo_loc,
-                  .setup_rhs_loc = SetupDiffRhs3_loc,
-                  .apply_loc     = Diff3_loc,
-                  .error_loc     = Error3_loc,
-                  .in_mode       = CEED_EVAL_GRAD,
-                  .out_mode      = CEED_EVAL_GRAD,
-                  .q_mode        = CEED_GAUSS,
-                  .enforce_bc    = PETSC_TRUE },
-    [CEED_BP5] = {.num_comp_u    = 1,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 7,
-                  .q_extra       = 0,
-                  .setup_geo     = SetupDiffGeo,
-                  .setup_rhs     = SetupDiffRhs,
-                  .apply         = Diff,
-                  .error         = Error,
-                  .setup_geo_loc = SetupDiffGeo_loc,
-                  .setup_rhs_loc = SetupDiffRhs_loc,
-                  .apply_loc     = Diff_loc,
-                  .error_loc     = Error_loc,
-                  .in_mode       = CEED_EVAL_GRAD,
-                  .out_mode      = CEED_EVAL_GRAD,
-                  .q_mode        = CEED_GAUSS_LOBATTO,
-                  .enforce_bc    = PETSC_TRUE },
-    [CEED_BP6] = {.num_comp_u    = 3,
-                  .num_comp_x    = 3,
-                  .topo_dim      = 3,
-                  .q_data_size   = 7,
-                  .q_extra       = 0,
-                  .setup_geo     = SetupDiffGeo,
-                  .setup_rhs     = SetupDiffRhs3,
-                  .apply         = Diff3,
-                  .error         = Error3,
-                  .setup_geo_loc = SetupDiffGeo_loc,
-                  .setup_rhs_loc = SetupDiffRhs3_loc,
-                  .apply_loc     = Diff3_loc,
-                  .error_loc     = Error3_loc,
-                  .in_mode       = CEED_EVAL_GRAD,
-                  .out_mode      = CEED_EVAL_GRAD,
-                  .q_mode        = CEED_GAUSS_LOBATTO,
-                  .enforce_bc    = PETSC_TRUE }
+BPData bp_options[10] = {
+    [CEED_BP1]  = {.num_comp_u    = 1,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 1,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupMassGeo,
+                   .setup_rhs     = SetupMassRhs,
+                   .apply         = Mass,
+                   .error         = Error,
+                   .setup_geo_loc = SetupMassGeo_loc,
+                   .setup_rhs_loc = SetupMassRhs_loc,
+                   .apply_loc     = Mass_loc,
+                   .error_loc     = Error_loc,
+                   .in_mode       = CEED_EVAL_INTERP,
+                   .out_mode      = CEED_EVAL_INTERP,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_FALSE},
+    [CEED_BP2]  = {.num_comp_u    = 3,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 1,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupMassGeo,
+                   .setup_rhs     = SetupMassRhs3,
+                   .apply         = Mass3,
+                   .error         = Error3,
+                   .setup_geo_loc = SetupMassGeo_loc,
+                   .setup_rhs_loc = SetupMassRhs3_loc,
+                   .apply_loc     = Mass3_loc,
+                   .error_loc     = Error3_loc,
+                   .in_mode       = CEED_EVAL_INTERP,
+                   .out_mode      = CEED_EVAL_INTERP,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_FALSE},
+    [CEED_BP3]  = {.num_comp_u    = 1,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupDiffRhs,
+                   .apply         = Diff,
+                   .error         = Error,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupDiffRhs_loc,
+                   .apply_loc     = Diff_loc,
+                   .error_loc     = Error_loc,
+                   .in_mode       = CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP4]  = {.num_comp_u    = 3,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupDiffRhs3,
+                   .apply         = Diff3,
+                   .error         = Error3,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupDiffRhs3_loc,
+                   .apply_loc     = Diff3_loc,
+                   .error_loc     = Error3_loc,
+                   .in_mode       = CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP5]  = {.num_comp_u    = 1,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 0,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupDiffRhs,
+                   .apply         = Diff,
+                   .error         = Error,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupDiffRhs_loc,
+                   .apply_loc     = Diff_loc,
+                   .error_loc     = Error_loc,
+                   .in_mode       = CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS_LOBATTO,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP6]  = {.num_comp_u    = 3,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 0,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupDiffRhs3,
+                   .apply         = Diff3,
+                   .error         = Error3,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupDiffRhs3_loc,
+                   .apply_loc     = Diff3_loc,
+                   .error_loc     = Error3_loc,
+                   .in_mode       = CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS_LOBATTO,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP13] = {.num_comp_u    = 1,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupMassDiffRhs,
+                   .apply         = MassDiff,
+                   .error         = Error,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupMassDiffRhs_loc,
+                   .apply_loc     = MassDiff_loc,
+                   .error_loc     = Error_loc,
+                   .in_mode       = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP24] = {.num_comp_u    = 3,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 1,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupMassDiffRhs3,
+                   .apply         = MassDiff3,
+                   .error         = Error3,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupMassDiffRhs3_loc,
+                   .apply_loc     = MassDiff3_loc,
+                   .error_loc     = Error3_loc,
+                   .in_mode       = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP15] = {.num_comp_u    = 1,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 0,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupMassDiffRhs,
+                   .apply         = MassDiff,
+                   .error         = Error,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupMassDiffRhs_loc,
+                   .apply_loc     = MassDiff_loc,
+                   .error_loc     = Error_loc,
+                   .in_mode       = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS_LOBATTO,
+                   .enforce_bc    = PETSC_TRUE },
+    [CEED_BP26] = {.num_comp_u    = 3,
+                   .num_comp_x    = 3,
+                   .topo_dim      = 3,
+                   .q_data_size   = 7,
+                   .q_extra       = 0,
+                   .setup_geo     = SetupDiffGeo,
+                   .setup_rhs     = SetupMassDiffRhs3,
+                   .apply         = MassDiff3,
+                   .error         = Error3,
+                   .setup_geo_loc = SetupDiffGeo_loc,
+                   .setup_rhs_loc = SetupMassDiffRhs3_loc,
+                   .apply_loc     = MassDiff3_loc,
+                   .error_loc     = Error3_loc,
+                   .in_mode       = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .out_mode      = CEED_EVAL_INTERP + CEED_EVAL_GRAD,
+                   .q_mode        = CEED_GAUSS_LOBATTO,
+                   .enforce_bc    = PETSC_TRUE },
 };
diff --git a/examples/petsc/include/libceedsetup.h b/examples/petsc/include/libceedsetup.h
index 611c30eb9a..c87130e923 100644
--- a/examples/petsc/include/libceedsetup.h
+++ b/examples/petsc/include/libceedsetup.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,8 +16,8 @@
 
 PetscErrorCode CeedDataDestroy(CeedInt i, CeedData data);
 PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt topo_dim, CeedInt q_extra, PetscInt num_comp_x, PetscInt num_comp_u,
-                                    PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, CeedVector rhs_ceed,
-                                    CeedVector *target);
+                                    PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, PetscBool is_fine_level,
+                                    CeedVector rhs_ceed, CeedVector *target);
 PetscErrorCode CeedLevelTransferSetup(DM dm, Ceed ceed, CeedInt level, CeedInt num_comp_u, CeedData *data, BPData bp_data, Vec fine_mult);
 PetscErrorCode SetupErrorOperator(DM dm, Ceed ceed, BPData bp_data, CeedInt topo_dim, PetscInt num_comp_x, PetscInt num_comp_u,
                                   CeedOperator *op_error);
diff --git a/examples/petsc/include/matops.h b/examples/petsc/include/matops.h
index 8c29f9e76a..d9e03b6f6d 100644
--- a/examples/petsc/include/matops.h
+++ b/examples/petsc/include/matops.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/petscutils.h b/examples/petsc/include/petscutils.h
index b1b07f7672..0f1f5d0ad6 100644
--- a/examples/petsc/include/petscutils.h
+++ b/examples/petsc/include/petscutils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/petscversion.h b/examples/petsc/include/petscversion.h
index 8c1d3f92be..426aeae2ba 100644
--- a/examples/petsc/include/petscversion.h
+++ b/examples/petsc/include/petscversion.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -9,6 +9,6 @@
 /// Petsc version check
 #pragma once
 
-#if PETSC_VERSION_LT(3, 21, 0)
-#error "PETSc v3.21 or later is required"
+#if PETSC_VERSION_LT(3, 23, 0)
+#error "PETSc v3.23 or later is required"
 #endif
diff --git a/examples/petsc/include/sphereproblemdata.h b/examples/petsc/include/sphereproblemdata.h
index 5142d9eeba..4a63deea05 100644
--- a/examples/petsc/include/sphereproblemdata.h
+++ b/examples/petsc/include/sphereproblemdata.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/include/structs.h b/examples/petsc/include/structs.h
index c33ad80b9b..8b2647fe16 100644
--- a/examples/petsc/include/structs.h
+++ b/examples/petsc/include/structs.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -65,7 +65,18 @@ typedef struct {
 } BPData;
 
 // BP options
-typedef enum { CEED_BP1 = 0, CEED_BP2 = 1, CEED_BP3 = 2, CEED_BP4 = 3, CEED_BP5 = 4, CEED_BP6 = 5 } BPType;
+typedef enum {
+  CEED_BP1  = 0,
+  CEED_BP2  = 1,
+  CEED_BP3  = 2,
+  CEED_BP4  = 3,
+  CEED_BP5  = 4,
+  CEED_BP6  = 5,
+  CEED_BP13 = 6,
+  CEED_BP24 = 7,
+  CEED_BP15 = 8,
+  CEED_BP26 = 9,
+} BPType;
 
 // -----------------------------------------------------------------------------
 // Parameter structure for running problems
diff --git a/examples/petsc/include/swarmutils.h b/examples/petsc/include/swarmutils.h
index 0eeff6e301..4beed9bef1 100644
--- a/examples/petsc/include/swarmutils.h
+++ b/examples/petsc/include/swarmutils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/petsc/multigrid.c b/examples/petsc/multigrid.c
index e9f78197a3..1bce6a318a 100644
--- a/examples/petsc/multigrid.c
+++ b/examples/petsc/multigrid.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -117,19 +117,21 @@ int main(int argc, char **argv) {
   if (read_mesh) {
     PetscCall(DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, &dm_orig));
   } else {
-    PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, dim, simplex, mesh_elem, NULL, NULL, NULL, PETSC_TRUE, &dm_orig));
+    PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, dim, simplex, mesh_elem, NULL, NULL, NULL, PETSC_TRUE, 0, PETSC_FALSE, &dm_orig));
   }
 
-  VecType vec_type;
+  // Set mesh vec_type
+  VecType vec_type = VECSTANDARD;
+
   switch (mem_type_backend) {
     case CEED_MEM_HOST:
       vec_type = VECSTANDARD;
       break;
     case CEED_MEM_DEVICE: {
       const char *resolved;
+
       CeedGetResource(ceed, &resolved);
       if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA;
-      else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD;  // https://github.com/CEED/libCEED/issues/678
       else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP;
       else vec_type = VECSTANDARD;
     }
@@ -155,10 +157,10 @@ int main(int argc, char **argv) {
 
   switch (coarsen) {
     case COARSEN_UNIFORM:
-      for (int i = 0; i < num_levels; i++) level_degrees[i] = i + 1;
+      for (PetscInt i = 0; i < num_levels; i++) level_degrees[i] = i + 1;
       break;
     case COARSEN_LOGARITHMIC:
-      for (int i = 0; i < num_levels - 1; i++) level_degrees[i] = pow(2, i);
+      for (PetscInt i = 0; i < num_levels - 1; i++) level_degrees[i] = pow(2, i);
       level_degrees[fine_level] = degree;
       break;
   }
@@ -181,7 +183,7 @@ int main(int argc, char **argv) {
   CeedElemTopology elem_topo = ElemTopologyP2C(cell_type);
 
   // Setup DM and Operator Mat Shells for each level
-  for (CeedInt i = 0; i < num_levels; i++) {
+  for (PetscInt i = 0; i < num_levels; i++) {
     // Create DM
     PetscCall(DMClone(dm_orig, &dm[i]));
     PetscCall(DMGetVecType(dm_orig, &vec_type));
@@ -199,7 +201,6 @@ int main(int argc, char **argv) {
 
     // Operator
     PetscCall(PetscMalloc1(1, &op_apply_ctx[i]));
-    PetscCall(PetscMalloc1(1, &op_error_ctx));
     PetscCall(MatCreateShell(comm, l_size[i], l_size[i], g_size[i], g_size[i], op_apply_ctx[i], &mat_O[i]));
     PetscCall(MatShellSetOperation(mat_O[i], MATOP_MULT, (void (*)(void))MatMult_Ceed));
     PetscCall(MatShellSetOperation(mat_O[i], MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiag));
@@ -267,7 +268,7 @@ int main(int argc, char **argv) {
     }
     PetscCall(PetscMalloc1(1, &ceed_data[i]));
     PetscCall(SetupLibceedByDegree(dm[i], ceed, level_degrees[i], dim, q_extra, dim, num_comp_u, g_size[i], xl_size[i], bp_options[bp_choice],
-                                   ceed_data[i], i == (fine_level), rhs_ceed, &target));
+                                   ceed_data[i], i == fine_level, i == fine_level, rhs_ceed, &target));
   }
 
   // Gather RHS
@@ -291,7 +292,7 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_error, "error", ceed_data[fine_level]->elem_restr_u, ceed_data[fine_level]->basis_u, CEED_VECTOR_ACTIVE);
 
   // Calculate multiplicity
-  for (int i = 0; i < num_levels; i++) {
+  for (PetscInt i = 0; i < num_levels; i++) {
     PetscMemType mem_type;
 
     // CEED vector
@@ -322,7 +323,7 @@ int main(int argc, char **argv) {
   }
 
   // Set up Mat
-  for (int i = 0; i < num_levels; i++) {
+  for (PetscInt i = fine_level; i >= 0; i--) {
     // Set up apply operator context
     PetscCall(SetupApplyOperatorCtx(comm, dm[i], ceed, ceed_data[i], X_loc[i], op_apply_ctx[i]));
 
@@ -335,8 +336,8 @@ int main(int argc, char **argv) {
       pr_restr_ctx[i]->loc_vec_c   = X_loc[i - 1];
       pr_restr_ctx[i]->loc_vec_f   = op_apply_ctx[i]->Y_loc;
       pr_restr_ctx[i]->mult_vec    = mult[i];
-      pr_restr_ctx[i]->ceed_vec_c  = op_apply_ctx[i - 1]->x_ceed;
-      pr_restr_ctx[i]->ceed_vec_f  = op_apply_ctx[i]->y_ceed;
+      pr_restr_ctx[i]->ceed_vec_c  = ceed_data[i - 1]->x_ceed;
+      pr_restr_ctx[i]->ceed_vec_f  = ceed_data[i]->y_ceed;
       pr_restr_ctx[i]->op_prolong  = ceed_data[i]->op_prolong;
       pr_restr_ctx[i]->op_restrict = ceed_data[i]->op_restrict;
       pr_restr_ctx[i]->ceed        = ceed;
@@ -393,7 +394,7 @@ int main(int argc, char **argv) {
 
     // PCMG levels
     PetscCall(PCMGSetLevels(pc, num_levels, NULL));
-    for (int i = 0; i < num_levels; i++) {
+    for (PetscInt i = 0; i < num_levels; i++) {
       // Smoother
       KSP smoother;
       PC  smoother_pc;
@@ -502,6 +503,7 @@ int main(int argc, char **argv) {
     }
     {
       // Set up error operator context
+      PetscCall(PetscMalloc1(1, &op_error_ctx));
       PetscCall(SetupErrorOperatorCtx(comm, dm[fine_level], ceed, ceed_data[fine_level], X_loc[fine_level], op_error, op_error_ctx));
       PetscScalar l2_error;
       PetscCall(ComputeL2Error(X[fine_level], &l2_error, op_error_ctx));
@@ -532,7 +534,7 @@ int main(int argc, char **argv) {
   }
 
   // Cleanup
-  for (int i = 0; i < num_levels; i++) {
+  for (PetscInt i = 0; i < num_levels; i++) {
     PetscCall(VecDestroy(&X[i]));
     PetscCall(VecDestroy(&X_loc[i]));
     PetscCall(VecDestroy(&mult[i]));
diff --git a/examples/petsc/qfunctions/area/areacube.h b/examples/petsc/qfunctions/area/areacube.h
index 93be0594b6..f008846f2a 100644
--- a/examples/petsc/qfunctions/area/areacube.h
+++ b/examples/petsc/qfunctions/area/areacube.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example for a scalar field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factor required for integration when reference coordinates have a different dimension than the one of physical
diff --git a/examples/petsc/qfunctions/area/areasphere.h b/examples/petsc/qfunctions/area/areasphere.h
index 7cd73ca354..13e5536e14 100644
--- a/examples/petsc/qfunctions/area/areasphere.h
+++ b/examples/petsc/qfunctions/area/areasphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example for a scalar field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factor required for integration when reference coordinates have a different dimension than the one of physical
diff --git a/examples/petsc/qfunctions/bps/bp1.h b/examples/petsc/qfunctions/bps/bp1.h
index a902b29f7c..fb35d0249e 100644
--- a/examples/petsc/qfunctions/bps/bp1.h
+++ b/examples/petsc/qfunctions/bps/bp1.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factors required to apply the mass operator
diff --git a/examples/petsc/qfunctions/bps/bp13.h b/examples/petsc/qfunctions/bps/bp13.h
new file mode 100644
index 0000000000..33d454546d
--- /dev/null
+++ b/examples/petsc/qfunctions/bps/bp13.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// libCEED QFunctions for diffusion operator example using PETSc
+
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <math.h>
+#endif
+
+// -----------------------------------------------------------------------------
+// This QFunction sets up the rhs and true solution for the problem
+// -----------------------------------------------------------------------------
+CEED_QFUNCTION(SetupMassDiffRhs)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+  const CeedScalar *x = in[0], *w = in[1];
+  CeedScalar       *true_soln = out[0], *rhs = out[1];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    const CeedScalar c[3] = {0, 1., 2.};
+    const CeedScalar k[3] = {1., 2., 3.};
+
+    true_soln[i] = sin(M_PI * (c[0] + k[0] * x[i + Q * 0])) * sin(M_PI * (c[1] + k[1] * x[i + Q * 1])) * sin(M_PI * (c[2] + k[2] * x[i + Q * 2]));
+
+    rhs[i] = w[i + Q * 0] * (M_PI * M_PI * (k[0] * k[0] + k[1] * k[1] + k[2] * k[2]) + 1.0) * true_soln[i];
+  }  // End of Quadrature Point Loop
+  return 0;
+}
+
+// -----------------------------------------------------------------------------
+// This QFunction applies the mass + diffusion operator for a scalar field.
+//
+// Inputs:
+//   u       - Input vector at quadrature points
+//   ug      - Input vector gradient at quadrature points
+//   q_data  - Geometric factors
+//
+// Output:
+//   v      - Output vector (test functions) at quadrature points
+//   vg     - Output vector (test functions) gradient at quadrature points
+// -----------------------------------------------------------------------------
+CEED_QFUNCTION(MassDiff)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar *u = in[0], *ug = in[1], *q_data = in[2];
+  CeedScalar       *v = out[0], *vg = out[1];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    // Read spatial derivatives of u
+    const CeedScalar du[3] = {ug[i + Q * 0], ug[i + Q * 1], ug[i + Q * 2]};
+    // Read q_data (dXdxdXdx_T symmetric matrix)
+    const CeedScalar dXdxdXdx_T[3][3] = {
+        {q_data[i + 1 * Q], q_data[i + 2 * Q], q_data[i + 3 * Q]},
+        {q_data[i + 2 * Q], q_data[i + 4 * Q], q_data[i + 5 * Q]},
+        {q_data[i + 3 * Q], q_data[i + 5 * Q], q_data[i + 6 * Q]}
+    };
+
+    // Mass
+    v[i] = q_data[i + 0 * Q] * u[i];
+    // Diff
+    for (int j = 0; j < 3; j++) {  // j = direction of vg
+      vg[i + j * Q] = (du[0] * dXdxdXdx_T[0][j] + du[1] * dXdxdXdx_T[1][j] + du[2] * dXdxdXdx_T[2][j]);
+    }
+  }  // End of Quadrature Point Loop
+  return 0;
+}
+// -----------------------------------------------------------------------------
diff --git a/examples/petsc/qfunctions/bps/bp1sphere.h b/examples/petsc/qfunctions/bps/bp1sphere.h
index d604406f29..394d3d6cae 100644
--- a/examples/petsc/qfunctions/bps/bp1sphere.h
+++ b/examples/petsc/qfunctions/bps/bp1sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example for a scalar field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factors required for integration and coordinate transformations when reference coordinates have a different
diff --git a/examples/petsc/qfunctions/bps/bp2.h b/examples/petsc/qfunctions/bps/bp2.h
index 22ba9fb788..21da3ec39a 100644
--- a/examples/petsc/qfunctions/bps/bp2.h
+++ b/examples/petsc/qfunctions/bps/bp2.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the rhs and true solution for the problem
diff --git a/examples/petsc/qfunctions/bps/bp24.h b/examples/petsc/qfunctions/bps/bp24.h
new file mode 100644
index 0000000000..4870cd1cfe
--- /dev/null
+++ b/examples/petsc/qfunctions/bps/bp24.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// libCEED QFunctions for diffusion operator example using PETSc
+
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
+#include <math.h>
+#endif
+
+// -----------------------------------------------------------------------------
+// This QFunction sets up the rhs and true solution for the problem
+// -----------------------------------------------------------------------------
+CEED_QFUNCTION(SetupMassDiffRhs3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+  const CeedScalar *x = in[0], *w = in[1];
+  CeedScalar       *true_soln = out[0], *rhs = out[1];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    const CeedScalar c[3] = {0, 1., 2.};
+    const CeedScalar k[3] = {1., 2., 3.};
+
+    // Component 1
+    true_soln[i + 0 * Q] =
+        sin(M_PI * (c[0] + k[0] * x[i + Q * 0])) * sin(M_PI * (c[1] + k[1] * x[i + Q * 1])) * sin(M_PI * (c[2] + k[2] * x[i + Q * 2]));
+    // Component 2
+    true_soln[i + 1 * Q] = 2 * true_soln[i + 0 * Q];
+    // Component 3
+    true_soln[i + 2 * Q] = 3 * true_soln[i + 0 * Q];
+
+    // Component 1
+    rhs[i + 0 * Q] = w[i + Q * 0] * (M_PI * M_PI * (k[0] * k[0] + k[1] * k[1] + k[2] * k[2]) + 1.0) * true_soln[i + 0 * Q];
+    // Component 2
+    rhs[i + 1 * Q] = 2 * rhs[i + 0 * Q];
+    // Component 3
+    rhs[i + 2 * Q] = 3 * rhs[i + 0 * Q];
+  }  // End of Quadrature Point Loop
+  return 0;
+}
+
+// -----------------------------------------------------------------------------
+// This QFunction applies the mass + diffusion operator for a vector field of 3 components.
+//
+// Inputs:
+//   u       - Input vector at quadrature points
+//   ug      - Input vector Jacobian at quadrature points
+//   q_data  - Geometric factors
+//
+// Output:
+//   v      - Output vector (test functions) at quadrature points
+//   vJ     - Output vector (test functions) Jacobian at quadrature points
+// -----------------------------------------------------------------------------
+CEED_QFUNCTION(MassDiff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar *u = in[0], *ug = in[1], *q_data = in[2];
+  CeedScalar       *v = out[0], *vg = out[1];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    // Read spatial derivatives of u components
+    const CeedScalar uJ[3][3] = {
+        {ug[i + (0 + 0 * 3) * Q], ug[i + (0 + 1 * 3) * Q], ug[i + (0 + 2 * 3) * Q]},
+        {ug[i + (1 + 0 * 3) * Q], ug[i + (1 + 1 * 3) * Q], ug[i + (1 + 2 * 3) * Q]},
+        {ug[i + (2 + 0 * 3) * Q], ug[i + (2 + 1 * 3) * Q], ug[i + (2 + 2 * 3) * Q]}
+    };
+    // Read q_data (dXdxdXdx_T symmetric matrix)
+    const CeedScalar dXdxdXdx_T[3][3] = {
+        {q_data[i + 1 * Q], q_data[i + 2 * Q], q_data[i + 3 * Q]},
+        {q_data[i + 2 * Q], q_data[i + 4 * Q], q_data[i + 5 * Q]},
+        {q_data[i + 3 * Q], q_data[i + 5 * Q], q_data[i + 6 * Q]}
+    };
+
+    for (int k = 0; k < 3; k++) {  // k = component
+      // Mass
+      v[i + k * Q] = q_data[i + 0 * Q] * u[i + k * Q];
+      // Diff
+      for (int j = 0; j < 3; j++) {  // j = direction of vg
+        vg[i + (k + j * 3) * Q] = (uJ[k][0] * dXdxdXdx_T[0][j] + uJ[k][1] * dXdxdXdx_T[1][j] + uJ[k][2] * dXdxdXdx_T[2][j]);
+      }
+    }
+  }  // End of Quadrature Point Loop
+
+  return 0;
+}
+// -----------------------------------------------------------------------------
diff --git a/examples/petsc/qfunctions/bps/bp2sphere.h b/examples/petsc/qfunctions/bps/bp2sphere.h
index 36a8e95778..aa08525c86 100644
--- a/examples/petsc/qfunctions/bps/bp2sphere.h
+++ b/examples/petsc/qfunctions/bps/bp2sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example for a vector field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the rhs and true solution for the problem
diff --git a/examples/petsc/qfunctions/bps/bp3.h b/examples/petsc/qfunctions/bps/bp3.h
index dcf84defae..153ad6e021 100644
--- a/examples/petsc/qfunctions/bps/bp3.h
+++ b/examples/petsc/qfunctions/bps/bp3.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for diffusion operator example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factors required to apply the diffusion operator
@@ -87,7 +89,6 @@ CEED_QFUNCTION(SetupDiffRhs)(void *ctx, CeedInt Q, const CeedScalar *const *in,
 
     rhs[i] = w[i + Q * 0] * M_PI * M_PI * (k[0] * k[0] + k[1] * k[1] + k[2] * k[2]) * true_soln[i];
   }  // End of Quadrature Point Loop
-
   return 0;
 }
 
diff --git a/examples/petsc/qfunctions/bps/bp3sphere.h b/examples/petsc/qfunctions/bps/bp3sphere.h
index 1f901dd97a..911e14d0ac 100644
--- a/examples/petsc/qfunctions/bps/bp3sphere.h
+++ b/examples/petsc/qfunctions/bps/bp3sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for diffusion operator example for a scalar field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factors required for integration and coordinate transformations when reference coordinates have a different
diff --git a/examples/petsc/qfunctions/bps/bp4.h b/examples/petsc/qfunctions/bps/bp4.h
index 46307c338a..0ccad57d68 100644
--- a/examples/petsc/qfunctions/bps/bp4.h
+++ b/examples/petsc/qfunctions/bps/bp4.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for diffusion operator example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the rhs and true solution for the problem
@@ -41,7 +43,6 @@ CEED_QFUNCTION(SetupDiffRhs3)(void *ctx, CeedInt Q, const CeedScalar *const *in,
     // Component 3
     rhs[i + 2 * Q] = 3 * rhs[i + 0 * Q];
   }  // End of Quadrature Point Loop
-
   return 0;
 }
 
@@ -56,7 +57,7 @@ CEED_QFUNCTION(SetupDiffRhs3)(void *ctx, CeedInt Q, const CeedScalar *const *in,
 //   vJ     - Output vector (test functions) Jacobian at quadrature points
 // -----------------------------------------------------------------------------
 CEED_QFUNCTION(Diff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  const CeedScalar *ug = in[0], *qd = in[1];
+  const CeedScalar *ug = in[0], *q_data = in[1];
   CeedScalar       *vg = out[0];
 
   // Quadrature Point Loop
@@ -69,9 +70,9 @@ CEED_QFUNCTION(Diff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSca
     };
     // Read q_data (dXdxdXdx_T symmetric matrix)
     const CeedScalar dXdxdXdx_T[3][3] = {
-        {qd[i + 1 * Q], qd[i + 2 * Q], qd[i + 3 * Q]},
-        {qd[i + 2 * Q], qd[i + 4 * Q], qd[i + 5 * Q]},
-        {qd[i + 3 * Q], qd[i + 5 * Q], qd[i + 6 * Q]}
+        {q_data[i + 1 * Q], q_data[i + 2 * Q], q_data[i + 3 * Q]},
+        {q_data[i + 2 * Q], q_data[i + 4 * Q], q_data[i + 5 * Q]},
+        {q_data[i + 3 * Q], q_data[i + 5 * Q], q_data[i + 6 * Q]}
     };
 
     for (int k = 0; k < 3; k++) {    // k = component
@@ -80,7 +81,6 @@ CEED_QFUNCTION(Diff3)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedSca
       }
     }
   }  // End of Quadrature Point Loop
-
   return 0;
 }
 // -----------------------------------------------------------------------------
diff --git a/examples/petsc/qfunctions/bps/bp4sphere.h b/examples/petsc/qfunctions/bps/bp4sphere.h
index 517f353371..43b4806afe 100644
--- a/examples/petsc/qfunctions/bps/bp4sphere.h
+++ b/examples/petsc/qfunctions/bps/bp4sphere.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// libCEED QFunctions for mass operator example for a vector field on the sphere using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the rhs and true solution for the problem
diff --git a/examples/petsc/qfunctions/bps/common.h b/examples/petsc/qfunctions/bps/common.h
index 26f374d5d4..09cccd5840 100644
--- a/examples/petsc/qfunctions/bps/common.h
+++ b/examples/petsc/qfunctions/bps/common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,7 +8,7 @@
 /// @file
 /// libCEED QFunctions for BP examples using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 // -----------------------------------------------------------------------------
 CEED_QFUNCTION(Error)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
diff --git a/examples/petsc/qfunctions/swarm/swarmmass.h b/examples/petsc/qfunctions/swarm/swarmmass.h
index e355eff8d7..1b6fa1e21c 100644
--- a/examples/petsc/qfunctions/swarm/swarmmass.h
+++ b/examples/petsc/qfunctions/swarm/swarmmass.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(SetupMass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
diff --git a/examples/petsc/src/libceedsetup.c b/examples/petsc/src/libceedsetup.c
index 086fb669c7..8f8323e7a6 100644
--- a/examples/petsc/src/libceedsetup.c
+++ b/examples/petsc/src/libceedsetup.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -40,14 +40,14 @@ PetscErrorCode CeedDataDestroy(CeedInt i, CeedData data) {
 // Set up libCEED for a given degree
 // -----------------------------------------------------------------------------
 PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt topo_dim, CeedInt q_extra, PetscInt num_comp_x, PetscInt num_comp_u,
-                                    PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, CeedVector rhs_ceed,
-                                    CeedVector *target) {
+                                    PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, PetscBool is_fine_level,
+                                    CeedVector rhs_ceed, CeedVector *target) {
   DM                  dm_coord;
   Vec                 coords;
   const PetscScalar  *coord_array;
   CeedBasis           basis_x, basis_u;
   CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_u_i, elem_restr_qd_i;
-  CeedQFunction       qf_setup_geo, qf_apply;
+  CeedQFunction       qf_setup_geo = NULL, qf_apply = NULL;
   CeedOperator        op_setup_geo, op_apply;
   CeedVector          x_coord, q_data, x_ceed, y_ceed;
   PetscInt            c_start, c_end, num_elem;
@@ -86,36 +86,64 @@ PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt to
   CeedVectorCreate(ceed, xl_size, &x_ceed);
   CeedVectorCreate(ceed, xl_size, &y_ceed);
 
-  // Create the QFunction that builds the context data
-  CeedQFunctionCreateInterior(ceed, 1, bp_data.setup_geo, bp_data.setup_geo_loc, &qf_setup_geo);
-  CeedQFunctionAddInput(qf_setup_geo, "x", num_comp_x, CEED_EVAL_INTERP);
-  CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x * topo_dim, CEED_EVAL_GRAD);
-  CeedQFunctionAddInput(qf_setup_geo, "weight", 1, CEED_EVAL_WEIGHT);
-  CeedQFunctionAddOutput(qf_setup_geo, "qdata", q_data_size, CEED_EVAL_NONE);
-
-  // Create the operator that builds the quadrature data
-  CeedOperatorCreate(ceed, qf_setup_geo, NULL, NULL, &op_setup_geo);
-  CeedOperatorSetField(op_setup_geo, "x", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE);
-  CeedOperatorSetField(op_setup_geo, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE);
-  CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
-  CeedOperatorSetField(op_setup_geo, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
-
-  // Setup q_data
-  CeedOperatorApply(op_setup_geo, x_coord, q_data, CEED_REQUEST_IMMEDIATE);
-
-  // Set up PDE operator
-  CeedInt in_scale  = bp_data.in_mode == CEED_EVAL_GRAD ? topo_dim : 1;
-  CeedInt out_scale = bp_data.out_mode == CEED_EVAL_GRAD ? topo_dim : 1;
-  CeedQFunctionCreateInterior(ceed, 1, bp_data.apply, bp_data.apply_loc, &qf_apply);
-  CeedQFunctionAddInput(qf_apply, "u", num_comp_u * in_scale, bp_data.in_mode);
-  CeedQFunctionAddInput(qf_apply, "qdata", q_data_size, CEED_EVAL_NONE);
-  CeedQFunctionAddOutput(qf_apply, "v", num_comp_u * out_scale, bp_data.out_mode);
+  if (is_fine_level) {
+    // Create the QFunction that builds the context data
+    CeedQFunctionCreateInterior(ceed, 1, bp_data.setup_geo, bp_data.setup_geo_loc, &qf_setup_geo);
+    CeedQFunctionAddInput(qf_setup_geo, "x", num_comp_x, CEED_EVAL_INTERP);
+    CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x * topo_dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddInput(qf_setup_geo, "weight", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddOutput(qf_setup_geo, "qdata", q_data_size, CEED_EVAL_NONE);
+
+    // Create the operator that builds the quadrature data
+    CeedOperatorCreate(ceed, qf_setup_geo, NULL, NULL, &op_setup_geo);
+    CeedOperatorSetField(op_setup_geo, "x", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_setup_geo, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+    CeedOperatorSetField(op_setup_geo, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+
+    // Setup q_data
+    CeedOperatorApply(op_setup_geo, x_coord, q_data, CEED_REQUEST_IMMEDIATE);
+
+    // Set up PDE operator
+    PetscBool is_interp = bp_data.in_mode == CEED_EVAL_INTERP;
+    CeedInt   in_scale  = bp_data.in_mode == CEED_EVAL_GRAD ? topo_dim : 1;
+    CeedInt   out_scale = bp_data.out_mode == CEED_EVAL_GRAD ? topo_dim : 1;
+
+    CeedQFunctionCreateInterior(ceed, 1, bp_data.apply, bp_data.apply_loc, &qf_apply);
+    if (bp_data.in_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) {
+      CeedQFunctionAddInput(qf_apply, "u", num_comp_u, CEED_EVAL_INTERP);
+      CeedQFunctionAddInput(qf_apply, "du", num_comp_u * topo_dim, CEED_EVAL_GRAD);
+    } else {
+      CeedQFunctionAddInput(qf_apply, is_interp ? "u" : "du", num_comp_u * in_scale, bp_data.in_mode);
+    }
+    CeedQFunctionAddInput(qf_apply, "qdata", q_data_size, CEED_EVAL_NONE);
+    if (bp_data.out_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) {
+      CeedQFunctionAddOutput(qf_apply, "v", num_comp_u, CEED_EVAL_INTERP);
+      CeedQFunctionAddOutput(qf_apply, "dv", num_comp_u * topo_dim, CEED_EVAL_GRAD);
+    } else {
+      CeedQFunctionAddOutput(qf_apply, is_interp ? "v" : "dv", num_comp_u * out_scale, bp_data.out_mode);
+    }
+
+    // Create the mass or diff operator
+    CeedOperatorCreate(ceed, qf_apply, NULL, NULL, &op_apply);
+    if (bp_data.in_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) {
+      CeedOperatorSetField(op_apply, "u", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+      CeedOperatorSetField(op_apply, "du", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    } else {
+      CeedOperatorSetField(op_apply, is_interp ? "u" : "du", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    }
+    CeedOperatorSetField(op_apply, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, q_data);
+    if (bp_data.out_mode == CEED_EVAL_INTERP + CEED_EVAL_GRAD) {
+      CeedOperatorSetField(op_apply, "v", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+      CeedOperatorSetField(op_apply, "dv", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    } else {
+      CeedOperatorSetField(op_apply, is_interp ? "v" : "dv", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    }
 
-  // Create the mass or diff operator
-  CeedOperatorCreate(ceed, qf_apply, NULL, NULL, &op_apply);
-  CeedOperatorSetField(op_apply, "u", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
-  CeedOperatorSetField(op_apply, "qdata", elem_restr_qd_i, CEED_BASIS_NONE, q_data);
-  CeedOperatorSetField(op_apply, "v", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE);
+    // Cleanup
+    CeedQFunctionDestroy(&qf_setup_geo);
+    CeedOperatorDestroy(&op_setup_geo);
+  }
 
   // Set up RHS if needed
   if (setup_rhs) {
@@ -151,10 +179,7 @@ PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt to
     CeedQFunctionDestroy(&qf_setup_rhs);
     CeedOperatorDestroy(&op_setup_rhs);
   }
-
   // Cleanup
-  CeedQFunctionDestroy(&qf_setup_geo);
-  CeedOperatorDestroy(&op_setup_geo);
   CeedVectorDestroy(&x_coord);
 
   // Save libCEED data required for level
diff --git a/examples/petsc/src/petscutils.c b/examples/petsc/src/petscutils.c
index e8fdc4fac3..1c4076ed10 100644
--- a/examples/petsc/src/petscutils.c
+++ b/examples/petsc/src/petscutils.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -425,7 +425,7 @@ PetscErrorCode CreateDistributedDM(RunParams rp, DM *dm) {
       }
     }
 
-    PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, rp->dim, rp->simplex, rp->mesh_elem, NULL, NULL, NULL, PETSC_TRUE, dm));
+    PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, rp->dim, rp->simplex, rp->mesh_elem, NULL, NULL, NULL, PETSC_TRUE, 0, PETSC_FALSE, dm));
   }
 
   PetscCall(DMSetFromOptions(*dm));
diff --git a/examples/petsc/src/swarmutils.c b/examples/petsc/src/swarmutils.c
index f736581ee5..21339ae9d6 100644
--- a/examples/petsc/src/swarmutils.c
+++ b/examples/petsc/src/swarmutils.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -391,7 +391,7 @@ PetscErrorCode DMSwarmCreateReferenceCoordinates(DM dm_swarm, IS *is_points, Vec
     }
 
     // -- Cleanup
-    PetscCall(PetscFree(points_in_cell));
+    PetscCall(DMSwarmSortRestorePointsPerCell(dm_swarm, cell, &num_points_in_cell, &points_in_cell));
   }
   cell_points[points_offset - 1] = num_points_local + points_offset;
 
@@ -617,6 +617,7 @@ PetscErrorCode SetupProblemSwarm(DM dm_swarm, Ceed ceed, BPData bp_data, CeedDat
   // Swarm objects
   {
     const PetscInt *cell_points;
+    CeedInt        *offsets;
     IS              is_points;
     Vec             X_ref;
     CeedInt         num_elem;
@@ -628,7 +629,7 @@ PetscErrorCode SetupProblemSwarm(DM dm_swarm, Ceed ceed, BPData bp_data, CeedDat
 
     PetscCall(ISGetIndices(is_points, &cell_points));
     PetscInt num_points = cell_points[num_elem + 1] - num_elem - 2;
-    CeedInt  offsets[num_elem + 1 + num_points];
+    PetscCall(PetscCalloc1(num_elem + 1 + num_points, &offsets));
 
     for (PetscInt i = 0; i < num_elem + 1; i++) offsets[i] = cell_points[i + 1] - 1;
     for (PetscInt i = num_elem + 1; i < num_points + num_elem + 1; i++) offsets[i] = cell_points[i + 1];
@@ -685,6 +686,7 @@ PetscErrorCode SetupProblemSwarm(DM dm_swarm, Ceed ceed, BPData bp_data, CeedDat
 
     // Cleanup
     PetscCall(ISDestroy(&is_points));
+    PetscCall(PetscFree(offsets));
     PetscCall(VecDestroy(&X_ref));
   }
 
diff --git a/examples/python/Makefile b/examples/python/Makefile
new file mode 100644
index 0000000000..64244ea2c1
--- /dev/null
+++ b/examples/python/Makefile
@@ -0,0 +1,20 @@
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+
+PYTHON ?= python3
+
+clean:
+	rm -rf build __pycache__ .pytest_cache *.so
+
+setup:
+	$(PYTHON) setup_qfunctions.py build
+
+TEST_OPTS ?= --ceed /cpu/self/ref/serial
+test: setup
+	$(PYTHON) -m pytest ex_test.py $(TEST_OPTS)
+
+.PHONY: clean setup test
diff --git a/examples/python/README.md b/examples/python/README.md
new file mode 100644
index 0000000000..ca0019ab94
--- /dev/null
+++ b/examples/python/README.md
@@ -0,0 +1,25 @@
+## libCEED Python Examples
+
+These examples are written using libCEED's Python interface.
+
+### Tutorials
+
+These Jupyter notebooks explore the concepts of the libCEED API, including how to install the Python interface and the usage of each API object, with interactive examples.
+
+### Basic Examples
+
+The basic libCEED C examples in the folder `/examples/ceed` are also available as Python examples.
+
+To build the QFunctions into a shared library that the Python examples use, run
+
+```bash
+make setup
+```
+
+To execute the examples, run:
+
+```
+python ex1_volume.py
+```
+
+A full list of command-line arguments are shown by adding the command-line argument "--help".
diff --git a/examples/python/conftest.py b/examples/python/conftest.py
new file mode 100644
index 0000000000..70bdf69cfc
--- /dev/null
+++ b/examples/python/conftest.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+
+import pytest
+
+# -------------------------------------------------------------------------------
+# Add --ceed command line argument
+# -------------------------------------------------------------------------------
+
+
+def pytest_addoption(parser):
+    parser.addoption("--ceed", action="store", default='/cpu/self/ref/blocked')
+
+
+@pytest.fixture(scope='session')
+def ceed_resource(request):
+    ceed_resource = request.config.option.ceed
+
+    return ceed_resource
+
+# -------------------------------------------------------------------------------
diff --git a/examples/python/ex1_volume.py b/examples/python/ex1_volume.py
new file mode 100644
index 0000000000..b08b7e34e3
--- /dev/null
+++ b/examples/python/ex1_volume.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+#
+# libCEED example using diffusion operator to compute surface area
+#
+# Sample runs:
+#
+#     python ex1_volume.py
+#     python ex1_volume -c /cpu/self
+#     python ex1_volume -c /gpu/cuda
+
+import sys
+import os
+import numpy as np
+import libceed
+import ex_common as common
+
+
+def main():
+    """Main function for volume example"""
+    args = common.parse_arguments()
+    return example_1(args)
+
+
+def example_1(args):
+    """Compute volume using mass operator
+
+    Args:
+        args: Parsed command line arguments
+
+    Returns:
+        int: 0 on success, error code on failure
+    """
+    # Process arguments
+    dim = args.dim
+    mesh_degree = max(args.mesh_degree, args.solution_degree)
+    sol_degree = args.solution_degree
+    num_qpts = args.quadrature_points
+    problem_size = args.problem_size if args.problem_size > 0 else (8 * 16 if args.test else 256 * 1024)
+    ncomp_x = dim  # Number of coordinate components
+
+    # Print configuration
+    if not args.quiet:
+        print("Selected options: [command line option] : <current value>")
+        print(f"    Ceed specification [-c] : {args.ceed}")
+        print(f"    Mesh dimension     [-d] : {dim}")
+        print(f"    Mesh degree        [-m] : {mesh_degree}")
+        print(f"    Solution degree    [-p] : {sol_degree}")
+        print(f"    Num. 1D quadr. pts [-q] : {num_qpts}")
+        print(f"    Approx. # unknowns [-s] : {problem_size}")
+        print(f"    QFunction source   [-g] : {'gallery' if args.gallery else 'user'}")
+
+    # Initialize CEED
+    ceed = libceed.Ceed(args.ceed)
+
+    # Create bases
+    # Tensor-product Lagrange basis for mesh coordinates
+    mesh_basis = ceed.BasisTensorH1Lagrange(
+        dim, ncomp_x, mesh_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Tensor-product Lagrange basis for solution
+    solution_basis = ceed.BasisTensorH1Lagrange(
+        dim, 1, sol_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Create mesh
+    # Determine mesh size
+    num_xyz = common.get_cartesian_mesh_size(dim, sol_degree, problem_size)
+    if not args.quiet:
+        print("\nMesh size                   : nx = %d" % num_xyz[0], end="")
+        if dim > 1:
+            print(", ny = %d" % num_xyz[1], end="")
+        if dim > 2:
+            print(", nz = %d" % num_xyz[2], end="")
+        print()
+
+    # Create element restrictions
+    num_q_comp = 1
+    mesh_restriction, mesh_size, _, _, _ = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, mesh_degree, ncomp_x, num_q_comp, num_qpts, create_qdata=False)
+    solution_restriction, sol_size, q_data_restriction, num_elem, elem_qpts = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, sol_degree, 1, num_q_comp, num_qpts, create_qdata=True)
+
+    if not args.quiet:
+        print("Number of mesh nodes        : %d" % (mesh_size // dim))
+        print("Number of solution nodes    : %d" % sol_size)
+
+    # Create and transform mesh coordinates
+    mesh_coords = ceed.Vector(mesh_size)
+    common.set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords)
+    exact_volume, _ = common.transform_mesh_coords(dim, mesh_size, mesh_coords)
+
+    # Create the QFunction that builds the mass operator (i.e. computes its quadrature data) and set its context data
+    qf_build = None
+    if args.gallery:
+        qf_build = ceed.QFunctionByName(f"Mass{dim}DBuild")
+    else:
+        build_ctx = ceed.QFunctionContext()
+        ctx_data = np.array([dim, dim], dtype=np.int32)
+        build_ctx.set_data(ctx_data)
+
+        qfs_so = common.load_qfs_so()
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+
+        qf_build = ceed.QFunction(1, qfs_so.build_mass,
+                                  os.path.join(file_dir, "ex1-volume.h:build_mass"))
+        qf_build.add_input("dx", dim * dim, libceed.EVAL_GRAD)
+        qf_build.add_input("weights", 1, libceed.EVAL_WEIGHT)
+        qf_build.add_output("qdata", num_q_comp, libceed.EVAL_NONE)
+        qf_build.set_context(build_ctx)
+
+    # Create the operator that builds the quadrature data for the mass operator
+    op_build = ceed.Operator(qf_build)
+    op_build.set_field("dx", mesh_restriction, mesh_basis, libceed.VECTOR_ACTIVE)
+    op_build.set_field("weights", libceed.ELEMRESTRICTION_NONE, mesh_basis, libceed.VECTOR_NONE)
+    op_build.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, libceed.VECTOR_ACTIVE)
+
+    # Compute the quadrature data for the mass operator
+    q_data = ceed.Vector(num_elem * elem_qpts * num_q_comp)
+    op_build.apply(mesh_coords, q_data)
+
+    # Setup QFunction for applying the mass operator
+    qf_mass = None
+    if args.gallery:
+        qf_mass = ceed.QFunctionByName("MassApply")
+    else:
+        build_ctx = ceed.QFunctionContext()
+        ctx_data = np.array([dim, dim], dtype=np.int32)
+        build_ctx.set_data(ctx_data)
+
+        qfs_so = common.load_qfs_so()
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+
+        qf_mass = ceed.QFunction(1, qfs_so.apply_mass,
+                                 os.path.join(file_dir, "ex1-volume.h:apply_mass"))
+        qf_mass.add_input("u", 1, libceed.EVAL_INTERP)
+        qf_mass.add_input("qdata", num_q_comp, libceed.EVAL_NONE)
+        qf_mass.add_output("v", 1, libceed.EVAL_INTERP)
+        qf_mass.set_context(build_ctx)
+
+    # Create the mass operator
+    op_mass = ceed.Operator(qf_mass)
+    op_mass.set_field("u", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+    op_mass.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, q_data)
+    op_mass.set_field("v", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+
+    # Create solution vectors
+    u = ceed.Vector(sol_size)
+    v = ceed.Vector(sol_size)
+    u.set_value(1.0)  # Set all entries of u to 1.0
+
+    # Apply mass operator: v = M * u
+    op_mass.apply(u, v)
+
+    # Compute volume by summing all entries in v
+    volume = 0.0
+    with v.array_read() as v_array:
+        # Simply sum all values to compute the volume
+        volume = np.sum(v_array)
+
+    if not args.test:
+        print()
+        print(f"Exact mesh volume    : {exact_volume:.14g}")
+        print(f"Computed mesh volume : {volume:.14g}")
+        print(f"Volume error         : {volume - exact_volume:.14g}")
+    else:
+        # Test mode - check if error is within tolerance
+        tol = 200 * libceed.EPSILON if dim == 1 else 1e-5
+        if abs(volume - exact_volume) > tol:
+            print(f"Volume error : {volume - exact_volume:.14g}")
+            sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/python/ex2_surface.py b/examples/python/ex2_surface.py
new file mode 100644
index 0000000000..f741600110
--- /dev/null
+++ b/examples/python/ex2_surface.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+#
+# libCEED example using diffusion operator to compute surface area
+#
+# Sample runs:
+#
+#     python ex2_surface.py
+#     python ex2_surface.py -c /cpu/self
+#     python ex2_surface.py -c /gpu/cuda
+
+import sys
+import os
+import numpy as np
+import libceed
+import ex_common as common
+
+
+def main():
+    """Main driver for surface area example"""
+    args = common.parse_arguments()
+    return example_2(args)
+
+
+def example_2(options):
+    """Compute surface area using diffusion operator
+
+    Args:
+        args: Parsed command line arguments
+
+    Returns:
+        int: 0 on success, error code on failure
+    """
+    # Process arguments
+    args = options
+    dim = args.dim
+    mesh_degree = max(args.mesh_degree, args.solution_degree)
+    sol_degree = args.solution_degree
+    num_qpts = args.quadrature_points
+    problem_size = args.problem_size if args.problem_size > 0 else (500 * dim * dim if args.test else 256 * 1024)
+    ncomp_x = dim  # Number of coordinate components
+
+    # Print configuration
+    if not args.quiet:
+        print("Selected options: [command line option] : <current value>")
+        print(f"    Ceed specification [-c] : {args.ceed}")
+        print(f"    Mesh dimension     [-d] : {dim}")
+        print(f"    Mesh degree        [-m] : {mesh_degree}")
+        print(f"    Solution degree    [-p] : {sol_degree}")
+        print(f"    Num. 1D quadr. pts [-q] : {num_qpts}")
+        print(f"    Approx. # unknowns [-s] : {problem_size}")
+        print(f"    QFunction source   [-g] : {'gallery' if args.gallery else 'user'}")
+
+    # Initialize CEED
+    ceed = libceed.Ceed(args.ceed)
+
+    # Create bases
+    # Tensor-product Lagrange basis for mesh coordinates
+    mesh_basis = ceed.BasisTensorH1Lagrange(
+        dim, ncomp_x, mesh_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Tensor-product Lagrange basis for solution
+    solution_basis = ceed.BasisTensorH1Lagrange(
+        dim, 1, sol_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Create mesh
+    # Determine mesh size
+    num_xyz = common.get_cartesian_mesh_size(dim, sol_degree, problem_size)
+    if not args.quiet:
+        print("\nMesh size                   : nx = %d" % num_xyz[0], end="")
+        if dim > 1:
+            print(", ny = %d" % num_xyz[1], end="")
+        if dim > 2:
+            print(", nz = %d" % num_xyz[2], end="")
+        print()
+
+    # Create element restrictions
+    num_q_comp = dim * (dim + 1) // 2
+    mesh_restriction, mesh_size, _, _, _ = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, mesh_degree, ncomp_x, num_q_comp, num_qpts, create_qdata=False)
+    solution_restriction, sol_size, q_data_restriction, num_elem, elem_qpts = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, sol_degree, 1, num_q_comp, num_qpts, create_qdata=True)
+
+    if not args.quiet:
+        print("Number of mesh nodes        : %d" % (mesh_size // dim))
+        print("Number of solution nodes    : %d" % sol_size)
+
+    # Create and transform mesh coordinates
+    mesh_coords = ceed.Vector(mesh_size)
+    common.set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords)
+    _, exact_surface_area = common.transform_mesh_coords(dim, mesh_size, mesh_coords, use_sin=False)
+
+    # Create the QFunction that builds the diffusion operator (i.e. computes
+    # its quadrature data) and set its context data
+    qf_build = None
+    if args.gallery:
+        qf_build = ceed.QFunctionByName(f"Poisson{dim}DBuild")
+    else:
+        build_ctx = ceed.QFunctionContext()
+        ctx_data = np.array([dim, dim], dtype=np.int32)
+        build_ctx.set_data(ctx_data)
+
+        qfs_so = common.load_qfs_so()
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+
+        qf_build = ceed.QFunction(1, qfs_so.build_diff,
+                                  os.path.join(file_dir, "ex2-surface.h:build_diff"))
+        qf_build.add_input("dx", dim * dim, libceed.EVAL_GRAD)
+        qf_build.add_input("weights", 1, libceed.EVAL_WEIGHT)
+        qf_build.add_output("qdata", num_q_comp, libceed.EVAL_NONE)
+        qf_build.set_context(build_ctx)
+
+    # Operator for building quadrature data
+    op_build = ceed.Operator(qf_build)
+    op_build.set_field("dx", mesh_restriction, mesh_basis, libceed.VECTOR_ACTIVE)
+    op_build.set_field("weights", libceed.ELEMRESTRICTION_NONE, mesh_basis, libceed.VECTOR_NONE)
+    op_build.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, libceed.VECTOR_ACTIVE)
+
+    # Compute quadrature data
+    q_data = ceed.Vector(num_elem * elem_qpts * num_q_comp)
+    op_build.apply(mesh_coords, q_data)
+
+    # Create the QFunction that defines the action of the diffusion operator
+    qf_diff = None
+    if args.gallery:
+        qf_diff = ceed.QFunctionByName(f"Poisson{dim}DApply")
+    else:
+        build_ctx = ceed.QFunctionContext()
+        ctx_data = np.array([dim, dim], dtype=np.int32)
+        build_ctx.set_data(ctx_data)
+
+        qfs_so = common.load_qfs_so()
+        file_dir = os.path.dirname(os.path.abspath(__file__))
+
+        qf_diff = ceed.QFunction(1, qfs_so.apply_diff,
+                                 os.path.join(file_dir, "ex2-surface.h:apply_diff"))
+        qf_diff.add_input("du", dim, libceed.EVAL_GRAD)
+        qf_diff.add_input("qdata", num_q_comp, libceed.EVAL_NONE)
+        qf_diff.add_output("dv", dim, libceed.EVAL_GRAD)
+        qf_diff.set_context(build_ctx)
+
+    # Diffusion operator
+    op_diff = ceed.Operator(qf_diff)
+    op_diff.set_field("du", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+    op_diff.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, q_data)
+    op_diff.set_field("dv", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+
+    # Create vectors
+    u = ceed.Vector(sol_size)  # Input vector
+    v = ceed.Vector(sol_size)  # Output vector
+
+    # Initialize u with sum of coordinates (x + y + z)
+    with mesh_coords.array_read() as x_array, u.array_write() as u_array:
+        for i in range(sol_size):
+            u_array[i] = sum(x_array[i + j * (sol_size)] for j in range(dim))
+
+    # Apply operator: v = K * u
+    op_diff.apply(u, v)
+
+    # Compute surface area by summing absolute values of v
+    surface_area = 0.0
+    with v.array_read() as v_array:
+        surface_area = np.sum(abs(v_array))
+
+    if not args.test:
+        print()
+        print(f"Exact mesh surface area    : {exact_surface_area:.14g}")
+        print(f"Computed mesh surface area : {surface_area:.14g}")
+        print(f"Surface area error         : {surface_area - exact_surface_area:.14g}")
+    else:
+        # Test mode - check if error is within tolerance
+        tol = 10000 * libceed.EPSILON if dim == 1 else 1e-1
+        if abs(surface_area - exact_surface_area) > tol:
+            print(f"Surface area error : {surface_area - exact_surface_area:.14g}")
+            sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/python/ex3_volume.py b/examples/python/ex3_volume.py
new file mode 100644
index 0000000000..7fe6df7387
--- /dev/null
+++ b/examples/python/ex3_volume.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+#
+# libCEED example using diffusion operator to compute surface area
+#
+# Sample runs:
+#
+#     python ex3_volume.py
+#     python ex3_volume -c /cpu/self
+#     python ex3_volume -c /gpu/cuda
+
+import sys
+import os
+import numpy as np
+import libceed
+import ex_common as common
+
+
+def main():
+    """Main function for volume example"""
+    args = common.parse_arguments()
+    example_3(args)
+
+
+def example_3(args):
+    """Compute volume using mass and diff operator
+
+    Args:
+        args: Parsed command line arguments
+
+    Returns:
+        int: 0 on success, error code on failure
+    """
+    # Process arguments
+    dim = args.dim
+    mesh_degree = max(args.mesh_degree, args.solution_degree)
+    sol_degree = args.solution_degree
+    num_qpts = args.quadrature_points
+    problem_size = args.problem_size if args.problem_size > 0 else (8 * 16 if args.test else 256 * 1024)
+    ncomp_x = dim  # Number of coordinate components
+
+    # Print configuration
+    if not args.quiet:
+        print("Selected options: [command line option] : <current value>")
+        print(f"    Ceed specification [-c] : {args.ceed}")
+        print(f"    Mesh dimension     [-d] : {dim}")
+        print(f"    Mesh degree        [-m] : {mesh_degree}")
+        print(f"    Solution degree    [-p] : {sol_degree}")
+        print(f"    Num. 1D quadr. pts [-q] : {num_qpts}")
+        print(f"    Approx. # unknowns [-s] : {problem_size}")
+        print(f"    QFunction source   [-g] : {'gallery' if args.gallery else 'user'}")
+
+    # Check - Gallery not supported
+    if args.gallery:
+        print("Gallery QFunction not supported for example 3")
+        sys.exit(1)
+
+    # Initialize CEED
+    ceed = libceed.Ceed(args.ceed)
+
+    # Create bases
+    # Tensor-product Lagrange basis for mesh coordinates
+    mesh_basis = ceed.BasisTensorH1Lagrange(
+        dim, ncomp_x, mesh_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Tensor-product Lagrange basis for solution
+    solution_basis = ceed.BasisTensorH1Lagrange(
+        dim, 1, sol_degree + 1, num_qpts, libceed.GAUSS)
+
+    # Create mesh
+    # Determine mesh size
+    num_xyz = common.get_cartesian_mesh_size(dim, sol_degree, problem_size)
+    if not args.quiet:
+        print("\nMesh size                   : nx = %d" % num_xyz[0], end="")
+        if dim > 1:
+            print(", ny = %d" % num_xyz[1], end="")
+        if dim > 2:
+            print(", nz = %d" % num_xyz[2], end="")
+        print()
+
+    # Create element restrictions
+    num_q_comp = 1 + dim * (dim + 1) // 2
+    mesh_restriction, mesh_size, _, _, _ = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, mesh_degree, ncomp_x, num_q_comp, num_qpts, create_qdata=False)
+    solution_restriction, sol_size, q_data_restriction, num_elem, elem_qpts = common.build_cartesian_restriction(
+        ceed, dim, num_xyz, sol_degree, 1, num_q_comp, num_qpts, create_qdata=True)
+
+    if not args.quiet:
+        print("Number of mesh nodes        : %d" % (mesh_size // dim))
+        print("Number of solution nodes    : %d" % sol_size)
+
+    # Create and transform mesh coordinates
+    mesh_coords = ceed.Vector(mesh_size)
+    common.set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords)
+    exact_volume, _ = common.transform_mesh_coords(dim, mesh_size, mesh_coords)
+
+    # Create QFunction context
+    build_ctx = ceed.QFunctionContext()
+    ctx_data = np.array([dim, dim], dtype=np.int32)
+    build_ctx.set_data(ctx_data)
+
+    # Load QFunctions
+    qfs_so = common.load_qfs_so()
+    file_dir = os.path.dirname(os.path.abspath(__file__))
+
+    # Create the QFunction that builds the mass + diffusion operator (i.e.
+    # computes its quadrature data) and set its context data
+    qf_build = ceed.QFunction(1, qfs_so.build_mass_diff,
+                              os.path.join(file_dir, "ex3-volume.h:build_mass_diff"))
+    qf_build.add_input("dx", dim * dim, libceed.EVAL_GRAD)
+    qf_build.add_input("weights", 1, libceed.EVAL_WEIGHT)
+    qf_build.add_output("qdata", num_q_comp, libceed.EVAL_NONE)
+    qf_build.set_context(build_ctx)
+
+    # Create the operator that builds the quadrature data for the mass + diffusion operator
+    op_build = ceed.Operator(qf_build)
+    op_build.set_field("dx", mesh_restriction, mesh_basis, libceed.VECTOR_ACTIVE)
+    op_build.set_field("weights", libceed.ELEMRESTRICTION_NONE, mesh_basis, libceed.VECTOR_NONE)
+    op_build.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, libceed.VECTOR_ACTIVE)
+
+    # Compute the quadrature data for the mass + diffusion operator
+    q_data = ceed.Vector(num_elem * elem_qpts * num_q_comp)
+    op_build.apply(mesh_coords, q_data)
+
+    # Create the QFunction that defines the action of the mass + diffusion operator
+    qf_apply = ceed.QFunction(1, qfs_so.apply_mass_diff,
+                              os.path.join(file_dir, "ex3-volume.h:apply_mass_diff"))
+    qf_apply.add_input("u", 1, libceed.EVAL_INTERP)
+    qf_apply.add_input("du", dim, libceed.EVAL_GRAD)
+    qf_apply.add_input("qdata", num_q_comp, libceed.EVAL_NONE)
+    qf_apply.add_output("v", 1, libceed.EVAL_INTERP)
+    qf_apply.add_output("dv", dim, libceed.EVAL_GRAD)
+    qf_apply.set_context(build_ctx)
+
+    # Create the mass + diffusion operator
+    op_apply = ceed.Operator(qf_apply)
+    op_apply.set_field("u", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+    op_apply.set_field("du", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+    op_apply.set_field("qdata", q_data_restriction, libceed.BASIS_NONE, q_data)
+    op_apply.set_field("v", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+    op_apply.set_field("dv", solution_restriction, solution_basis, libceed.VECTOR_ACTIVE)
+
+    # Create solution vectors
+    u = ceed.Vector(sol_size)
+    v = ceed.Vector(sol_size)
+    u.set_value(1.0)  # Set all entries of u to 1.0
+
+    # Apply mass + diffusion operator: v = (M + K) * u
+    op_apply.apply(u, v)
+
+    # Compute volume by summing all entries in v
+    volume = 0.0
+    with v.array_read() as v_array:
+        # Simply sum all values to compute the volume
+        volume = np.sum(v_array)
+
+    if not args.test:
+        print()
+        print(f"Exact mesh volume    : {exact_volume:.14g}")
+        print(f"Computed mesh volume : {volume:.14g}")
+        print(f"Volume error         : {volume - exact_volume:.14g}")
+    else:
+        # Test mode - check if error is within tolerance
+        tol = 200 * libceed.EPSILON if dim == 1 else 1e-5
+        if abs(volume - exact_volume) > tol:
+            print(f"Volume error : {volume - exact_volume:.14g}")
+            sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/python/ex_common.py b/examples/python/ex_common.py
new file mode 100644
index 0000000000..00e75805fb
--- /dev/null
+++ b/examples/python/ex_common.py
@@ -0,0 +1,255 @@
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+
+import sys
+import os
+from sysconfig import get_config_var
+import argparse
+import math
+import numpy as np
+import libceed
+import ctypes
+
+
+def parse_arguments():
+    """Parse command line arguments for surface area computation
+
+    Returns:
+        Namespace: Parsed arguments with fields:
+            ceed: CEED resource specifier
+            dim: Problem dimension (1-3)
+            mesh_degree: Mesh polynomial degree
+            solution_degree: Solution polynomial degree
+            num_qpts: Number of quadrature points
+            problem_size: Approximate problem size
+            test: Test mode flag
+            quiet: Suppress output flag
+            gallery: Use gallery QFunctions flag
+    """
+    parser = argparse.ArgumentParser(description="libCEED surface area example")
+    parser.add_argument("-c", "--ceed", default="/cpu/self",
+                        help="libCEED resource specifier (default: /cpu/self)")
+    parser.add_argument("-d", "--dim", type=int, default=3,
+                        help="Problem dimension (1-3) (default: 3)")
+    parser.add_argument("-m", "--mesh-degree", type=int, default=4,
+                        help="Mesh polynomial degree (default: 4)")
+    parser.add_argument("-p", "--solution-degree", type=int, default=4,
+                        help="Solution polynomial degree (default: 4)")
+    parser.add_argument("-q", "--quadrature-points", type=int, default=6,
+                        help="Number of quadrature points (default: 6)")
+    parser.add_argument("-s", "--problem-size", type=int, default=-1,
+                        help="Approximate problem size (default: ~256k)")
+    parser.add_argument("-t", "--test", action="store_true",
+                        help="Test mode (reduced problem size)")
+    parser.add_argument("--quiet", action="store_true",
+                        help="Suppress output")
+    parser.add_argument("-g", "--gallery", action="store_true",
+                        help="Use gallery QFunctions")
+
+    args = parser.parse_args()
+    if args.dim not in [1, 2, 3]:
+        parser.error("Dimension must be 1, 2, or 3")
+    return args
+
+
+def get_cartesian_mesh_size(dim, degree, prob_size):
+    """Determine Cartesian mesh size for given problem size
+
+    Args:
+        dim: Spatial dimension (1-3)
+        degree: Polynomial degree
+        prob_size: Target problem size
+
+    Returns:
+        list: Number of elements in each dimension
+    """
+    # Calculate number of elements needed
+    num_elem = prob_size // (degree ** dim)
+
+    # Find smallest power of 2 >= num_elem
+    s = 0
+    while num_elem > 1:
+        num_elem = num_elem / 2
+        s += 1
+
+    # Distribute across dimensions
+    r = s % dim
+    num_xyz = []
+    for d in range(dim):
+        sd = s // dim
+        if r > 0:
+            sd += 1
+            r -= 1
+        num_xyz.append(1 << sd)
+    return num_xyz
+
+
+def build_cartesian_restriction(ceed, dim, num_xyz, degree, num_comp, num_q_comp, num_qpts, create_qdata=False):
+    """Build element restriction for Cartesian grid
+
+    Args:
+        ceed: libCEED context
+        dim: Spatial dimension
+        num_xyz: Elements per dimension
+        degree: Polynomial degree
+        num_comp: Number of components
+        num_q_comp: Number of quadrature data components
+        num_qpts: Quadrature points per dimension
+        build_qdata: Flag to build restriction for quadrature data
+
+    Returns:
+        tuple: (elem_restriction, size, q_data_restriction, num_elem, elem_qpts)
+    """
+    p = degree + 1  # Nodes per element per dimension
+    num_nodes = p ** dim
+    elem_qpts = num_qpts ** dim
+
+    # Calculate grid parameters
+    nd = []
+    num_elem = 1
+    scalar_size = 1
+    for d in range(dim):
+        num_elem *= num_xyz[d]
+        nd.append(num_xyz[d] * (p - 1) + 1)  # Nodes per dimension
+        scalar_size *= nd[d]
+
+    size = scalar_size * num_comp
+
+    # Create element connectivity
+    elem_nodes = np.zeros(num_elem * num_nodes, dtype=np.int32)
+    for e in range(num_elem):
+        # Get element coordinates
+        e_xyz = [0] * dim
+        re = e
+        for d in range(dim):
+            e_xyz[d] = re % num_xyz[d]
+            re //= num_xyz[d]
+
+        # Calculate global node numbers
+        for n in range(num_nodes):
+            g_node = 0
+            g_stride = 1
+            r_node = n
+            for d in range(dim):
+                g_node += (e_xyz[d] * (p - 1) + r_node % p) * g_stride
+                g_stride *= nd[d]
+                r_node //= p
+            elem_nodes[e * num_nodes + n] = g_node
+
+    # Create restrictions
+    elem_restriction = ceed.ElemRestriction(
+        num_elem, num_nodes, num_comp, scalar_size, size, elem_nodes)
+
+    q_data_restriction = None
+    if create_qdata:
+        strides = np.array([1, elem_qpts, elem_qpts * num_q_comp], dtype=np.int32)
+        q_data_restriction = ceed.StridedElemRestriction(
+            num_elem, elem_qpts, num_q_comp, num_elem * elem_qpts * num_q_comp, strides)
+
+    return elem_restriction, size, q_data_restriction, num_elem, elem_qpts
+
+
+def set_cartesian_mesh_coords(ceed, dim, num_xyz, mesh_degree, mesh_coords):
+    """Create Cartesian mesh coordinates
+
+    Args:
+        ceed: libCEED context
+        dim: Spatial dimension
+        num_xyz: Elements per dimension
+        mesh_degree: Mesh polynomial degree
+        mesh_coords: CeedVector to hold mesh coordinates
+
+    Returns:
+        Vector: Mesh coordinates
+    """
+    p = mesh_degree + 1
+    nd = []
+    scalar_size = 1
+    for d in range(dim):
+        nd.append(num_xyz[d] * (p - 1) + 1)
+        scalar_size *= nd[d]
+
+    # Get Lobatto nodes (quadrature points)
+    nodes, _ = ceed.lobatto_quadrature(p)
+    nodes = 0.5 + 0.5 * nodes  # Map from [-1,1] to [0,1]
+
+    # Create coordinates
+    coords = np.zeros(scalar_size * dim)
+    for gs_node in range(scalar_size):
+        r_node = gs_node
+        for d in range(dim):
+            d_1d = r_node % nd[d]
+            elem_id = d_1d // (p - 1)
+            node_id = d_1d % (p - 1)
+            coords[gs_node + scalar_size * d] = (elem_id + nodes[node_id]) / num_xyz[d]
+            r_node //= nd[d]
+
+    mesh_coords.set_array(coords, cmode=libceed.COPY_VALUES)
+    return mesh_coords
+
+
+def transform_mesh_coords(dim, mesh_size, mesh_coords, use_sin=True):
+    """Transform mesh coordinates and return exact surface area
+
+    Args:
+        dim: Spatial dimension
+        mesh_size: Total mesh vector size
+        mesh_coords: Mesh coordinates vector
+        use_sin: Use sinusoidal transformation
+
+    Returns:
+        float: Tuple with exact volume and surface area for transformed mesh
+    """
+    exact_volume = {1: 1.0, 2: 3. / 4. * np.pi, 3: 3. / 4. * np.pi}[dim]
+    exact_area = {1: 2.0, 2: 4.0, 3: 6.0}[dim]
+
+    # Apply transformation to coordinates
+    num_nodes = mesh_size // dim
+    with mesh_coords.array_write() as coords:
+        if dim == 1:
+            for i in range(num_nodes):
+                x = coords[i] - 0.5
+                coords[i] = 0.5 + (1.0 / np.sqrt(3.0)) * np.sin((2.0 / 3.0) * np.pi * x)
+        else:
+            if use_sin:
+                for i in range(num_nodes):
+                    u = 1. + coords[i]
+                    v = np.pi / 2. * coords[i + num_nodes]
+                    coords[i] = u * np.cos(v)
+                    coords[i + num_nodes] = u * np.sin(v)
+            else:
+                for i in range(num_nodes):
+                    x = coords[i] - 0.5
+                    coords[i] = 0.5 + (1.0 / np.sqrt(3.0)) * np.sin((2.0 / 3.0) * np.pi * x)
+
+    return (exact_volume, exact_area)
+
+
+def find_qfs_so(name, path):
+    """Find the QFunctions shared library.
+    Returns:
+        Filepath to shared library object
+    """
+    for root, dirs, files in os.walk(path):
+        if name in files:
+            return os.path.join(root, name)
+
+
+def load_qfs_so():
+    """Load the QFunctions shared library.
+    Returns:
+        Loaded shared library object
+    """
+    file_dir = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        "build")
+    qfs_so = find_qfs_so(
+        "libceed_c_qfunctions" + get_config_var("EXT_SUFFIX"),
+        file_dir)
+
+    # Load library
+    return ctypes.cdll.LoadLibrary(qfs_so)
diff --git a/examples/python/ex_test.py b/examples/python/ex_test.py
new file mode 100644
index 0000000000..4d9cbf1e6a
--- /dev/null
+++ b/examples/python/ex_test.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+
+import pytest
+from argparse import Namespace
+import ex1_volume
+import ex2_surface
+import ex3_volume
+
+# -------------------------------------------------------------------------------
+
+
+def test_101(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=1,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex1_volume.example_1(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_101g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=1,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex1_volume.example_1(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_102(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=2,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex1_volume.example_1(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_102g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=2,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex1_volume.example_1(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_103(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=3,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex1_volume.example_1(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_103g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=3,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex1_volume.example_1(args)
+
+
+# -------------------------------------------------------------------------------
+def test_201(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=1,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_201g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=1,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_202(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=2,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_202g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=2,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_203(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=3,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_203g(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=3,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=True,
+    )
+    ex2_surface.example_2(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_301(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=1,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex3_volume.example_3(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_302(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=2,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex3_volume.example_3(args)
+
+# -------------------------------------------------------------------------------
+
+
+def test_303(ceed_resource):
+    args = Namespace(
+        ceed=ceed_resource,
+        dim=3,
+        mesh_degree=4,
+        solution_degree=4,
+        quadrature_points=6,
+        problem_size=-1,
+        test=True,
+        quiet=True,
+        gallery=False,
+    )
+    ex3_volume.example_3(args)
+
+# -------------------------------------------------------------------------------
diff --git a/backends/occa/kernels/kernel-defines.hpp b/examples/python/qfunctions/ex-common.h
similarity index 51%
rename from backends/occa/kernels/kernel-defines.hpp
rename to examples/python/qfunctions/ex-common.h
index beb0c79624..32b867b67f 100644
--- a/backends/occa/kernels/kernel-defines.hpp
+++ b/examples/python/qfunctions/ex-common.h
@@ -1,13 +1,14 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
+#pragma once
 
-#ifndef CEED_OCCA_KERNELS_KERNELDEFINES_HEADER
-#define CEED_OCCA_KERNELS_KERNELDEFINES_HEADER
+#include <ceed/types.h>
 
-#define STRINGIFY_SOURCE(...) #__VA_ARGS__
-
-#endif
+/// A structure used to pass additional data
+struct BuildContext {
+  CeedInt dim, space_dim;
+};
diff --git a/examples/python/qfunctions/ex1-volume.h b/examples/python/qfunctions/ex1-volume.h
new file mode 100644
index 0000000000..907b77bf5c
--- /dev/null
+++ b/examples/python/qfunctions/ex1-volume.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <ceed/types.h>
+#include "ex-common.h"
+
+/// libCEED Q-function for building quadrature data for a mass operator
+CEED_QFUNCTION(build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0] is Jacobians with shape [dim, dim, Q]
+  // in[1] is quadrature weights with shape [1, Q]
+  const CeedScalar    *w          = in[1];
+  CeedScalar          *q_data     = out[0];
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
+  switch (build_data->dim + 10 * build_data->space_dim) {
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[0][0][i] * w[i]; }  // End of Quadrature Point Loop
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i];
+      }  // End of Quadrature Point Loop
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
+      // Quadrature Point Loop
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        q_data[i] =
+            (J[0][0][i] * (J[1][1][i] * J[2][2][i] - J[1][2][i] * J[2][1][i]) - J[0][1][i] * (J[1][0][i] * J[2][2][i] - J[1][2][i] * J[2][0][i]) +
+             J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) *
+            w[i];
+      }  // End of Quadrature Point Loop
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// libCEED Q-function for applying a mass operator
+CEED_QFUNCTION(apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0], out[0] are solution variables with shape [1, Q]
+  // in[1] is quadrature data with shape [1, Q]
+  const CeedScalar *u = in[0], *q_data = in[1];
+  CeedScalar       *v = out[0];
+
+  // Quadrature Point Loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = q_data[i] * u[i]; }  // End of Quadrature Point Loop
+  return CEED_ERROR_SUCCESS;
+}
diff --git a/examples/python/qfunctions/ex2-surface.h b/examples/python/qfunctions/ex2-surface.h
new file mode 100644
index 0000000000..980a952105
--- /dev/null
+++ b/examples/python/qfunctions/ex2-surface.h
@@ -0,0 +1,135 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <ceed/types.h>
+#include "ex-common.h"
+
+/// libCEED Q-function for building quadrature data for a diffusion operator
+CEED_QFUNCTION(build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0] is Jacobians with shape [dim, dim, Q]
+  // in[1] is quadrature weights, size (Q)
+  const CeedScalar *w             = in[1];
+  CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
+  // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
+  // the symmetric part of the result.
+  switch (build_data->dim + 10 * build_data->space_dim) {
+    case 11: {
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[0][i] = w[i] / J[0][0][i]; }  // End of Quadrature Point Loop
+    } break;
+    case 22: {
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // J: 0 2   q_data: 0 2   adj(J):  J11 -J01
+        //    1 3           2 1           -J10  J00
+        const CeedScalar J00 = J[0][0][i];
+        const CeedScalar J10 = J[0][1][i];
+        const CeedScalar J01 = J[1][0][i];
+        const CeedScalar J11 = J[1][1][i];
+        const CeedScalar qw  = w[i] / (J00 * J11 - J10 * J01);
+
+        q_data[0][i] = qw * (J01 * J01 + J11 * J11);
+        q_data[1][i] = qw * (J00 * J00 + J10 * J10);
+        q_data[2][i] = -qw * (J00 * J01 + J10 * J11);
+      }  // End of Quadrature Point Loop
+    } break;
+    case 33: {
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Compute the adjoint
+        CeedScalar A[3][3];
+
+        for (CeedInt j = 0; j < 3; j++) {
+          for (CeedInt k = 0; k < 3; k++) {
+            // Equivalent code with J as a VLA and no mod operations:
+            // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1]
+            A[k][j] =
+                J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i];
+          }
+        }
+
+        // Compute quadrature weight / det(J)
+        const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
+
+        // Compute geometric factors
+        // Stored in Voigt convention
+        // 0 5 4
+        // 5 1 3
+        // 4 3 2
+        q_data[0][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
+        q_data[1][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
+        q_data[2][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
+        q_data[3][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
+        q_data[4][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
+        q_data[5][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
+      }  // End of Quadrature Point Loop
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// libCEED Q-function for applying a diff operator
+CEED_QFUNCTION(apply_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+  // in[0], out[0] solution gradients with shape [dim, 1, Q]
+  // in[1] is quadrature data with shape [num_components, Q]
+  const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
+
+  switch (build_data->dim) {
+    case 1: {
+      const CeedScalar *ug = in[0];
+      CeedScalar       *vg = out[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[0][i]; }  // End of Quadrature Point Loop
+    } break;
+    case 2: {
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 0 2
+        // 2 1
+        const CeedScalar dXdxdXdx_T[2][2] = {
+            {q_data[0][i], q_data[2][i]},
+            {q_data[2][i], q_data[1][i]}
+        };
+
+        // j = direction of vg
+        for (int j = 0; j < 2; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]);
+      }  // End of Quadrature Point Loop
+    } break;
+    case 3: {
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 0 5 4
+        // 5 1 3
+        // 4 3 2
+        const CeedScalar dXdxdXdx_T[3][3] = {
+            {q_data[0][i], q_data[5][i], q_data[4][i]},
+            {q_data[5][i], q_data[1][i], q_data[3][i]},
+            {q_data[4][i], q_data[3][i], q_data[2][i]}
+        };
+
+        // j = direction of vg
+        for (int j = 0; j < 3; j++) vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]);
+      }  // End of Quadrature Point Loop
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
diff --git a/examples/python/qfunctions/ex3-volume.h b/examples/python/qfunctions/ex3-volume.h
new file mode 100644
index 0000000000..1a992480cc
--- /dev/null
+++ b/examples/python/qfunctions/ex3-volume.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#pragma once
+
+#include <ceed.h>
+#include "ex-common.h"
+
+/// libCEED Q-function for building quadrature data for a mass + diffusion operator
+CEED_QFUNCTION(build_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0] is Jacobians with shape [dim, dim, Q]
+  // in[1] is quadrature weights, size (Q)
+  const CeedScalar *w             = in[1];
+  CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+
+  // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
+  // the symmetric part of the result.
+  switch (build_data->dim + 10 * build_data->space_dim) {
+    case 11: {  // dim = 1, space_dim = 1
+      const CeedScalar(*J)[1][CEED_Q_VLA] = (const CeedScalar(*)[1][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        q_data[0][i] = w[i] * J[0][0][i];
+
+        // Diffusion
+        q_data[1][i] = w[i] / J[0][0][i];
+      }
+    } break;
+    case 22: {  // dim = 2, space_dim = 2
+      const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // J: 0 2   q_data: 0 2   adj(J):  J22 -J12
+        //    1 3           2 1           -J10  J00
+        const CeedScalar J00 = J[0][0][i];
+        const CeedScalar J10 = J[0][1][i];
+        const CeedScalar J01 = J[1][0][i];
+        const CeedScalar J11 = J[1][1][i];
+        const CeedScalar qw  = w[i] / (J00 * J11 - J10 * J01);
+
+        // Mass
+        q_data[0][i] = w[i] * (J00 * J11 - J10 * J01);
+
+        // Diffusion
+        q_data[1][i] = qw * (J01 * J01 + J11 * J11);
+        q_data[2][i] = qw * (J00 * J00 + J10 * J10);
+        q_data[3][i] = -qw * (J00 * J01 + J10 * J11);
+      }
+    } break;
+    case 33: {  // dim = 3, space_dim = 3
+      const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Compute the adjoint
+        CeedScalar A[3][3];
+        for (CeedInt j = 0; j < 3; j++) {
+          for (CeedInt k = 0; k < 3; k++) {
+            A[k][j] =
+                J[(k + 1) % 3][(j + 1) % 3][i] * J[(k + 2) % 3][(j + 2) % 3][i] - J[(k + 2) % 3][(j + 1) % 3][i] * J[(k + 1) % 3][(j + 2) % 3][i];
+          }
+        }
+
+        // Compute quadrature weight / det(J)
+        const CeedScalar qw = w[i] / (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
+
+        // Mass
+        q_data[0][i] = w[i] * (J[0][0][i] * A[0][0] + J[0][1][i] * A[0][1] + J[0][2][i] * A[0][2]);
+
+        // Diffusion
+        // Stored in Voigt convention
+        // 1 6 5
+        // 6 2 4
+        // 5 4 3
+        q_data[1][i] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]);
+        q_data[2][i] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]);
+        q_data[3][i] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]);
+        q_data[4][i] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]);
+        q_data[5][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
+        q_data[6][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
+      }
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// libCEED Q-function for applying a mass + diffusion operator
+CEED_QFUNCTION(apply_mass_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  struct BuildContext *build_data = (struct BuildContext *)ctx;
+  // in[0], out[0] solution values with shape [1, 1, Q]
+  // in[1], out[1] solution gradients with shape [dim, 1, Q]
+  // in[2] is quadrature data with shape [num_components, Q]
+  const CeedScalar(*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
+
+  switch (build_data->dim) {
+    case 1: {
+      const CeedScalar *u = in[0], *ug = in[1];
+      CeedScalar       *v = out[0], *vg = out[1];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[0][i] * u[i];
+
+        // Diffusion
+        vg[i] = q_data[1][i] * ug[i];
+      }
+    } break;
+    case 2: {
+      const CeedScalar *u               = in[0];
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
+      CeedScalar *v                     = out[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[1];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[0][i] * u[i];
+
+        // Diffusion
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 1 3
+        // 3 2
+        const CeedScalar dXdxdXdx_T[2][2] = {
+            {q_data[1][i], q_data[3][i]},
+            {q_data[3][i], q_data[2][i]}
+        };
+
+        // j = direction of vg
+        for (int j = 0; j < 2; j++) {
+          vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j]);
+        }
+      }
+    } break;
+    case 3: {
+      const CeedScalar *u               = in[0];
+      const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
+      CeedScalar *v                     = out[0];
+      CeedScalar(*vg)[CEED_Q_VLA]       = (CeedScalar(*)[CEED_Q_VLA])out[1];
+
+      CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+        // Mass
+        v[i] = q_data[0][i] * u[i];
+
+        // Diffusion
+        // Read q_data (dXdxdXdx_T symmetric matrix)
+        // Stored in Voigt convention
+        // 1 6 5
+        // 6 2 4
+        // 5 4 3
+        const CeedScalar dXdxdXdx_T[3][3] = {
+            {q_data[1][i], q_data[6][i], q_data[5][i]},
+            {q_data[6][i], q_data[2][i], q_data[4][i]},
+            {q_data[5][i], q_data[4][i], q_data[3][i]}
+        };
+
+        // j = direction of vg
+        for (int j = 0; j < 3; j++) {
+          vg[j][i] = (ug[0][i] * dXdxdXdx_T[0][j] + ug[1][i] * dXdxdXdx_T[1][j] + ug[2][i] * dXdxdXdx_T[2][j]);
+        }
+      }
+    } break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
diff --git a/examples/python/qfunctions/qfunctions.c b/examples/python/qfunctions/qfunctions.c
new file mode 100644
index 0000000000..ee41a501a7
--- /dev/null
+++ b/examples/python/qfunctions/qfunctions.c
@@ -0,0 +1,22 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+// -----------------------------------------------------------------------------
+// Redefine QFunction Macro
+// -----------------------------------------------------------------------------
+#undef CEED_QFUNCTION
+#define CEED_QFUNCTION(name) extern int name
+
+// -----------------------------------------------------------------------------
+// QFunction Sources
+// -----------------------------------------------------------------------------
+#include "ex1-volume.h"
+#include "ex2-surface.h"
+#include "ex3-volume.h"
+
+// -----------------------------------------------------------------------------
diff --git a/examples/python/setup_qfunctions.py b/examples/python/setup_qfunctions.py
new file mode 100644
index 0000000000..8c337621e7
--- /dev/null
+++ b/examples/python/setup_qfunctions.py
@@ -0,0 +1,32 @@
+from setuptools import setup, Extension
+from sys import platform
+import os
+
+# Get CEED directory
+ceed_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# Include directories
+include_dirs = [os.path.join(ceed_dir, "include")]
+
+# Library directories
+library_dirs = [os.path.join(ceed_dir, "lib")]
+
+# Source files
+sources = ["qfunctions/qfunctions.c"]
+
+# Compiler arguments
+extra_compile_args = []
+if platform == "linux" or platform == "linux2" or platform == "darwin":
+    extra_compile_args = ["-O3", "-march=native", "-std=c11"]
+
+# Define the extension module
+qfunctions = Extension("libceed_c_qfunctions",
+                       sources=sources,
+                       include_dirs=include_dirs,
+                       library_dirs=library_dirs,
+                       libraries=["ceed"],
+                       extra_compile_args=extra_compile_args)
+
+# Setup
+setup(name="libceed_c_qfunctions",
+      ext_modules=[qfunctions])
diff --git a/examples/python/tutorial-0-ceed.ipynb b/examples/python/tutorial-0-ceed.ipynb
index b1d712a552..801081154f 100644
--- a/examples/python/tutorial-0-ceed.ipynb
+++ b/examples/python/tutorial-0-ceed.ipynb
@@ -92,8 +92,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Similarly, if libCEED is built with GPU support, you can specify a GPU backend, e.g., `/gpu/occa` or `/gpu/cuda/gen`."
+    "Similarly, if libCEED is built with GPU support, you can specify a GPU backend, e.g., `/gpu/hip` or `/gpu/cuda/gen`."
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/examples/python/tutorial-2-elemrestriction.ipynb b/examples/python/tutorial-2-elemrestriction.ipynb
index c9a9483288..6c8f8593a7 100644
--- a/examples/python/tutorial-2-elemrestriction.ipynb
+++ b/examples/python/tutorial-2-elemrestriction.ipynb
@@ -61,20 +61,20 @@
     "\n",
     "ceed = libceed.Ceed()\n",
     "\n",
-    "ne = 3\n",
+    "num_elem = 3\n",
     "\n",
-    "x = ceed.Vector(ne+1)\n",
-    "a = np.arange(10, 10 + ne+1, dtype=\"float64\")\n",
+    "x = ceed.Vector(num_elem+1)\n",
+    "a = np.arange(10, 10 + num_elem+1, dtype=\"float64\")\n",
     "x.set_array(a, cmode=libceed.USE_POINTER)\n",
     "\n",
-    "ind = np.zeros(2*ne, dtype=\"int32\")\n",
-    "for i in range(ne):\n",
-    "  ind[2*i+0] = i\n",
-    "  ind[2*i+1] = i+1\n",
+    "indices = np.zeros(2*num_elem, dtype=\"int32\")\n",
+    "for i in range(num_elem):\n",
+    "  indices[2*i+0] = i\n",
+    "  indices[2*i+1] = i+1\n",
     "    \n",
-    "r = ceed.ElemRestriction(ne, 2, 1, 1, ne+1, ind, cmode=libceed.USE_POINTER)\n",
+    "r = ceed.ElemRestriction(num_elem, 2, 1, 1, num_elem+1, indices, cmode=libceed.USE_POINTER)\n",
     "\n",
-    "y = ceed.Vector(2*ne)\n",
+    "y = ceed.Vector(2*num_elem)\n",
     "y.set_value(0)\n",
     "\n",
     "r.apply(x, y)\n",
@@ -100,17 +100,17 @@
     "# \n",
     "#  x -- o -- o -- x -- o -- o -- x -- o -- o -- x\n",
     "\n",
-    "ne = 3\n",
+    "num_elem = 3\n",
     "\n",
-    "ind = np.zeros(4*ne, dtype=\"int32\")\n",
+    "indices = np.zeros(4*num_elem, dtype=\"int32\")\n",
     "\n",
-    "for i in range(ne):\n",
-    "  ind[4*i+0] = i*3+0\n",
-    "  ind[4*i+1] = i*3+1\n",
-    "  ind[4*i+2] = i*3+2\n",
-    "  ind[4*i+3] = i*3+3\n",
+    "for i in range(num_elem):\n",
+    "  indices[4*i+0] = i*3+0\n",
+    "  indices[4*i+1] = i*3+1\n",
+    "  indices[4*i+2] = i*3+2\n",
+    "  indices[4*i+3] = i*3+3\n",
     "\n",
-    "r = ceed.ElemRestriction(ne, 4, 1, 1, 3*ne+1, ind, cmode=libceed.USE_POINTER)\n",
+    "r = ceed.ElemRestriction(num_elem, 4, 1, 1, 3*num_elem+1, indices, cmode=libceed.USE_POINTER)\n",
     "\n",
     "mult = r.get_multiplicity()\n",
     "\n",
@@ -141,17 +141,17 @@
     "#  x --  x |  x --  x |  x --  x\n",
     "# 10 -- 11 | 12 -- 13 | 14 -- 15\n",
     "\n",
-    "ne = 3\n",
+    "num_elem = 3\n",
     "\n",
-    "x = ceed.Vector(2*ne)\n",
-    "a = np.arange(10, 10 + 2*ne, dtype=\"float64\")\n",
+    "x = ceed.Vector(2*num_elem)\n",
+    "a = np.arange(10, 10 + 2*num_elem, dtype=\"float64\")\n",
     "x.set_array(a, cmode=libceed.USE_POINTER)\n",
     "\n",
     "strides = np.array([1, 2, 2], dtype=\"int32\")\n",
     "\n",
-    "r = ceed.StridedElemRestriction(ne, 2, 1, 2*ne, strides)\n",
+    "r = ceed.StridedElemRestriction(num_elem, 2, 1, 2*num_elem, strides)\n",
     "\n",
-    "y = ceed.Vector(2*ne)\n",
+    "y = ceed.Vector(2*num_elem)\n",
     "y.set_value(0)\n",
     "\n",
     "r.apply(x, y)\n",
@@ -177,11 +177,11 @@
     "# \n",
     "#  x -- x -- x -- x\n",
     "\n",
-    "ne = 3\n",
+    "num_elem = 3\n",
     "\n",
     "strides = np.array([1, 2, 2], dtype=\"int32\")\n",
     "\n",
-    "r = ceed.BlockedStridedElemRestriction(ne, 2, 2, 1, ne+1, strides)\n",
+    "r = ceed.BlockedStridedElemRestriction(num_elem, 2, 2, 1, 2*(num_elem+1), strides)\n",
     "\n",
     "print(r)"
    ]
@@ -233,22 +233,22 @@
     "# | 10-11-12-13-14        11-12-13-14-15 | 15-16-17-17-17        16-17-18-18-18 |\n",
     "# | e0 e1 e2 e3 e4        e0 e1 e2 e3 e4 | e0 e1 e2 e3 e4        e0 e1 e2 e3 e4 |\n",
     "\n",
-    "ne = 8\n",
-    "blksize = 5\n",
+    "num_elem = 8\n",
+    "block_size = 5\n",
     "\n",
-    "x = ceed.Vector(ne+1)\n",
-    "a = np.arange(10, 10 + ne+1, dtype=\"float64\")\n",
+    "x = ceed.Vector(num_elem+1)\n",
+    "a = np.arange(10, 10 + num_elem+1, dtype=\"float64\")\n",
     "x.set_array(a, cmode=libceed.USE_POINTER)\n",
     "\n",
-    "ind = np.zeros(2*ne, dtype=\"int32\")\n",
-    "for i in range(ne):\n",
-    "  ind[2*i+0] = i\n",
-    "  ind[2*i+1] = i+1\n",
+    "indices = np.zeros(2*num_elem, dtype=\"int32\")\n",
+    "for i in range(num_elem):\n",
+    "  indices[2*i+0] = i\n",
+    "  indices[2*i+1] = i+1\n",
     "\n",
-    "r = ceed.BlockedElemRestriction(ne, 2, blksize, 1, 1, ne+1, ind,\n",
+    "r = ceed.BlockedElemRestriction(num_elem, 2, block_size, 1, 1, num_elem+1, indices,\n",
     "                                cmode=libceed.USE_POINTER)\n",
     "\n",
-    "y = ceed.Vector(2*blksize*2)\n",
+    "y = ceed.Vector(2*block_size*2)\n",
     "y.set_value(0)\n",
     "\n",
     "r.apply(x, y)\n",
@@ -303,22 +303,22 @@
     "# | 15-16-17-17-17        16-17-18-18-18 |\n",
     "# | e0 e1 e2 e3 e4        e0 e1 e2 e3 e4 |\n",
     "\n",
-    "ne = 8\n",
-    "blksize = 5\n",
+    "num_elem = 8\n",
+    "block_size = 5\n",
     "\n",
-    "x = ceed.Vector(ne+1)\n",
-    "a = np.arange(10, 10 + ne+1, dtype=\"float64\")\n",
+    "x = ceed.Vector(num_elem+1)\n",
+    "a = np.arange(10, 10 + num_elem+1, dtype=\"float64\")\n",
     "x.set_array(a, cmode=libceed.USE_POINTER)\n",
     "\n",
-    "ind = np.zeros(2*ne, dtype=\"int32\")\n",
-    "for i in range(ne):\n",
-    "  ind[2*i+0] = i\n",
-    "  ind[2*i+1] = i+1\n",
+    "indices = np.zeros(2*num_elem, dtype=\"int32\")\n",
+    "for i in range(num_elem):\n",
+    "  indices[2*i+0] = i\n",
+    "  indices[2*i+1] = i+1\n",
     "\n",
-    "r = ceed.BlockedElemRestriction(ne, 2, blksize, 1, 1, ne+1, ind,\n",
+    "r = ceed.BlockedElemRestriction(num_elem, 2, block_size, 1, 1, num_elem+1, indices,\n",
     "                                cmode=libceed.USE_POINTER)\n",
     "\n",
-    "y = ceed.Vector(blksize*2)\n",
+    "y = ceed.Vector(block_size*2)\n",
     "y.set_value(0)\n",
     "\n",
     "r.apply_block(1, x, y)\n",
@@ -343,7 +343,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -357,7 +357,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.13.2"
   }
  },
  "nbformat": 4,
diff --git a/examples/python/tutorial-3-basis.ipynb b/examples/python/tutorial-3-basis.ipynb
index a2141e4e9d..ef18be2789 100644
--- a/examples/python/tutorial-3-basis.ipynb
+++ b/examples/python/tutorial-3-basis.ipynb
@@ -63,11 +63,11 @@
     "    center += 0.1\n",
     "  return result\n",
     "\n",
-    "def feval(x1, x2):\n",
-    "  return x1*x1 + x2*x2 + x1*x2 + 1\n",
+    "def feval(x_1, x_2):\n",
+    "  return x_1*x_1 + x_2*x_2 + x_1*x_2 + 1\n",
     "\n",
-    "def dfeval(x1, x2):\n",
-    "  return 2*x1 + x2"
+    "def dfeval(x_1, x_2):\n",
+    "  return 2*x_1 + x_2"
    ]
   },
   {
@@ -112,24 +112,24 @@
    "outputs": [],
    "source": [
     "P = b.get_num_nodes()\n",
-    "nviz = 50\n",
-    "bviz = ceed.BasisTensorH1Lagrange(1, 1, P, nviz, libceed.GAUSS_LOBATTO)\n",
+    "Q_viz = 50\n",
+    "basis_viz = ceed.BasisTensorH1Lagrange(1, 1, P, Q_viz, libceed.GAUSS_LOBATTO)\n",
     "\n",
     "# Construct P \"elements\" with one node activated\n",
     "I = ceed.Vector(P * P)\n",
-    "with I.array(P, P) as x:\n",
+    "with I.array_write(P, P) as x:\n",
     "    x[...] = np.eye(P)\n",
     "\n",
-    "Bvander = ceed.Vector(P * nviz)\n",
-    "bviz.apply(4, libceed.EVAL_INTERP, I, Bvander)\n",
+    "basis_fns = ceed.Vector(P * Q_viz)\n",
+    "basis_viz.apply(4, libceed.EVAL_INTERP, I, basis_fns)\n",
     "\n",
-    "qviz, _weight = ceed.lobatto_quadrature(nviz)\n",
-    "with Bvander.array_read(nviz, P) as B:\n",
-    "    plt.plot(qviz, B)\n",
+    "qpts_viz, _ = ceed.lobatto_quadrature(Q_viz)\n",
+    "with basis_fns.array_read(Q_viz, P) as B_array:\n",
+    "    plt.plot(qpts_viz, B_array)\n",
     "\n",
     "# Mark tho Lobatto nodes\n",
-    "qb, _weight = ceed.lobatto_quadrature(P)\n",
-    "plt.plot(qb, 0*qb, 'ok');"
+    "nodes, _ = ceed.lobatto_quadrature(P)\n",
+    "plt.plot(nodes, 0*nodes, 'ok');"
    ]
   },
   {
@@ -148,11 +148,11 @@
     "b = ceed.BasisTensorH1Lagrange(1, 1, 4, 4, libceed.GAUSS)\n",
     "print(b)\n",
     "\n",
-    "with Bvander.array_read(nviz, P) as B:\n",
-    "    plt.plot(qviz, B)\n",
+    "with basis_fns.array_read(Q_viz, P) as B_array:\n",
+    "    plt.plot(qpts_viz, B_array)\n",
     "# Mark tho Gauss quadrature points\n",
-    "qb, _weight = ceed.gauss_quadrature(P)\n",
-    "plt.plot(qb, 0*qb, 'ok');"
+    "qpts, _ = ceed.gauss_quadrature(P)\n",
+    "plt.plot(qpts, 0*qpts, 'ok');"
    ]
   },
   {
@@ -193,54 +193,52 @@
    "source": [
     "for dim in range(1, 4):\n",
     "  Q = 4\n",
-    "  Qdim = Q**dim\n",
-    "  Xdim = 2**dim\n",
-    "  x = np.empty(Xdim*dim, dtype=\"float64\")\n",
-    "  uq = np.empty(Qdim, dtype=\"float64\")\n",
+    "  Q_dim = Q**dim\n",
+    "  X_dim = 2**dim\n",
+    "  x = np.empty(X_dim*dim, dtype=\"float64\")\n",
+    "  u_array = np.empty(Q_dim, dtype=\"float64\")\n",
     "\n",
     "  for d in range(dim):\n",
-    "    for i in range(Xdim):\n",
-    "      x[d*Xdim + i] = 1 if (i % (2**(dim-d))) // (2**(dim-d-1)) else -1\n",
+    "    for i in range(X_dim):\n",
+    "      x[d*X_dim + i] = 1 if (i % (2**(dim-d))) // (2**(dim-d-1)) else -1\n",
     "\n",
-    "  X = ceed.Vector(Xdim*dim)\n",
+    "  X = ceed.Vector(X_dim*dim)\n",
     "  X.set_array(x, cmode=libceed.USE_POINTER)\n",
-    "  Xq = ceed.Vector(Qdim*dim)\n",
-    "  Xq.set_value(0)\n",
-    "  U = ceed.Vector(Qdim)\n",
+    "  X_q = ceed.Vector(Q_dim*dim)\n",
+    "  X_q.set_value(0)\n",
+    "  U = ceed.Vector(Q_dim)\n",
     "  U.set_value(0)\n",
-    "  Uq = ceed.Vector(Qdim)\n",
+    "  U_q = ceed.Vector(Q_dim)\n",
     "\n",
-    "  bxl = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS_LOBATTO)\n",
-    "  bul = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS_LOBATTO)\n",
+    "  basis_x_lobatto = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS_LOBATTO)\n",
+    "  basis_u_lobatto = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS_LOBATTO)\n",
     "\n",
-    "  bxl.apply(1, libceed.EVAL_INTERP, X, Xq)\n",
+    "  basis_x_lobatto.apply(1, libceed.EVAL_INTERP, X, X_q)\n",
     "\n",
-    "  with Xq.array_read() as xq:\n",
-    "    for i in range(Qdim):\n",
-    "      xx = np.empty(dim, dtype=\"float64\")\n",
+    "  with X_q.array_read() as x_array:\n",
+    "    for i in range(Q_dim):\n",
+    "      x = np.empty(dim, dtype=\"float64\")\n",
     "      for d in range(dim):\n",
-    "        xx[d] = xq[d*Qdim + i]\n",
-    "      uq[i] = eval(dim, xx)\n",
+    "        x[d] = x_array[d*Q_dim + i]\n",
+    "      u_array[i] = eval(dim, x)\n",
     "\n",
-    "  Uq.set_array(uq, cmode=libceed.USE_POINTER)\n",
+    "  U_q.set_array(u_array, cmode=libceed.USE_POINTER)\n",
     "\n",
     "  # This operation is the identity because the quadrature is collocated\n",
-    "  bul.T.apply(1, libceed.EVAL_INTERP, Uq, U)\n",
+    "  basis_u_lobatto.T.apply(1, libceed.EVAL_INTERP, U_q, U)\n",
     "\n",
-    "  bxg = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS)\n",
-    "  bug = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS)\n",
+    "  basis_x_gauss = ceed.BasisTensorH1Lagrange(dim, dim, 2, Q, libceed.GAUSS)\n",
+    "  basis_u_gauss = ceed.BasisTensorH1Lagrange(dim, 1, Q, Q, libceed.GAUSS)\n",
     "\n",
-    "  bxg.apply(1, libceed.EVAL_INTERP, X, Xq)\n",
-    "  bug.apply(1, libceed.EVAL_INTERP, U, Uq)\n",
+    "  basis_x_gauss.apply(1, libceed.EVAL_INTERP, X, X_q)\n",
+    "  basis_u_gauss.apply(1, libceed.EVAL_INTERP, U, U_q)\n",
     "\n",
-    "  with Xq.array_read() as xq, Uq.array_read() as u:\n",
-    "    #print('xq =', xq)\n",
-    "    #print('u =', u)\n",
+    "  with X_q.array_read() as x_array, U_q.array_read() as u_array:\n",
     "    if dim == 2:\n",
     "        # Default ordering is contiguous in x direction, but\n",
     "        # pyplot expects meshgrid convention, which is transposed.\n",
-    "        x, y = xq.reshape(2, Q, Q).transpose(0, 2, 1)\n",
-    "        plt.scatter(x, y, c=np.array(u).reshape(Q, Q))\n",
+    "        x, y = x_array.reshape(2, Q, Q).transpose(0, 2, 1)\n",
+    "        plt.scatter(x, y, c=np.array(u_array).reshape(Q, Q))\n",
     "        plt.xlim(-1, 1)\n",
     "        plt.ylim(-1, 1)\n",
     "        plt.colorbar(label='u')"
@@ -261,62 +259,62 @@
    "source": [
     "for dim in range (1, 4):\n",
     "  P, Q = 8, 10\n",
-    "  Pdim = P**dim\n",
-    "  Qdim = Q**dim\n",
-    "  Xdim = 2**dim\n",
-    "  sum1 = sum2 = 0\n",
-    "  x = np.empty(Xdim*dim, dtype=\"float64\")\n",
-    "  u = np.empty(Pdim, dtype=\"float64\")\n",
+    "  P_dim = P**dim\n",
+    "  Q_dim = Q**dim\n",
+    "  X_dim = 2**dim\n",
+    "  sum_1 = sum_2 = 0\n",
+    "  x_array = np.empty(X_dim*dim, dtype=\"float64\")\n",
+    "  u_array = np.empty(P_dim, dtype=\"float64\")\n",
     "\n",
     "  for d in range(dim):\n",
-    "    for i in range(Xdim):\n",
-    "      x[d*Xdim + i] = 1 if (i % (2**(dim-d))) // (2**(dim-d-1)) else -1\n",
-    "\n",
-    "  X = ceed.Vector(Xdim*dim)\n",
-    "  X.set_array(x, cmode=libceed.USE_POINTER)\n",
-    "  Xq = ceed.Vector(Pdim*dim)\n",
-    "  Xq.set_value(0)\n",
-    "  U = ceed.Vector(Pdim)\n",
-    "  Uq = ceed.Vector(Qdim*dim)\n",
-    "  Uq.set_value(0)\n",
-    "  Ones = ceed.Vector(Qdim*dim)\n",
+    "    for i in range(X_dim):\n",
+    "      x_array[d*X_dim + i] = 1 if (i % (2**(dim-d))) // (2**(dim-d-1)) else -1\n",
+    "\n",
+    "  X = ceed.Vector(X_dim*dim)\n",
+    "  X.set_array(x_array, cmode=libceed.USE_POINTER)\n",
+    "  X_q = ceed.Vector(P_dim*dim)\n",
+    "  X_q.set_value(0)\n",
+    "  U = ceed.Vector(P_dim)\n",
+    "  U_q = ceed.Vector(Q_dim*dim)\n",
+    "  U_q.set_value(0)\n",
+    "  Ones = ceed.Vector(Q_dim*dim)\n",
     "  Ones.set_value(1)\n",
-    "  Gtposeones = ceed.Vector(Pdim)\n",
-    "  Gtposeones.set_value(0)\n",
+    "  G_transpose_ones = ceed.Vector(P_dim)\n",
+    "  G_transpose_ones.set_value(0)\n",
     "\n",
     "  # Get function values at quadrature points\n",
-    "  bxl = ceed.BasisTensorH1Lagrange(dim, dim, 2, P, libceed.GAUSS_LOBATTO)\n",
-    "  bxl.apply(1, libceed.EVAL_INTERP, X, Xq)\n",
+    "  basis_x_lobatto = ceed.BasisTensorH1Lagrange(dim, dim, 2, P, libceed.GAUSS_LOBATTO)\n",
+    "  basis_x_lobatto.apply(1, libceed.EVAL_INTERP, X, X_q)\n",
     "\n",
-    "  with Xq.array_read() as xq:\n",
-    "    for i in range(Pdim):\n",
-    "      xx = np.empty(dim, dtype=\"float64\")\n",
+    "  with X_q.array_read() as x_array:\n",
+    "    for i in range(P_dim):\n",
+    "      x = np.empty(dim, dtype=\"float64\")\n",
     "      for d in range(dim):\n",
-    "        xx[d] = xq[d*Pdim + i]\n",
-    "      u[i] = eval(dim, xx)\n",
+    "        x[d] = x_array[d*P_dim + i]\n",
+    "      u_array[i] = eval(dim, x)\n",
     "\n",
-    "  U.set_array(u, cmode=libceed.USE_POINTER)\n",
+    "  U.set_array(u_array, cmode=libceed.USE_POINTER)\n",
     "\n",
     "  # Calculate G u at quadrature points, G' * 1 at dofs\n",
-    "  bug = ceed.BasisTensorH1Lagrange(dim, 1, P, Q, libceed.GAUSS)\n",
-    "  bug.apply(1, libceed.EVAL_GRAD, U, Uq)\n",
-    "  bug.T.apply(1, libceed.EVAL_GRAD, Ones, Gtposeones)\n",
+    "  basis_u_gauss = ceed.BasisTensorH1Lagrange(dim, 1, P, Q, libceed.GAUSS)\n",
+    "  basis_u_gauss.apply(1, libceed.EVAL_GRAD, U, U_q)\n",
+    "  basis_u_gauss.T.apply(1, libceed.EVAL_GRAD, Ones, G_transpose_ones)\n",
     "\n",
     "  # Check if 1' * G * u = u' * (G' * 1)\n",
-    "  with Gtposeones.array_read() as gtposeones, Uq.array_read() as uq:\n",
-    "    for i in range(Pdim):\n",
-    "      sum1 += gtposeones[i]*u[i]\n",
-    "    for i in range(dim*Qdim):\n",
-    "      sum2 += uq[i]\n",
+    "  with G_transpose_ones.array_read() as g_array, U_q.array_read() as uq_array:\n",
+    "    for i in range(P_dim):\n",
+    "      sum_1 += g_array[i]*u_array[i]\n",
+    "    for i in range(dim*Q_dim):\n",
+    "      sum_2 += uq_array[i]\n",
     "\n",
     "  # Check that (1' * G * u - u' * (G' * 1)) is numerically zero\n",
-    "  print('1T * G * u - uT * (GT * 1) =', np.abs(sum1 - sum2))"
+    "  print('1T * G * u - uT * (GT * 1) =', np.abs(sum_1 - sum_2))"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -330,7 +328,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.13.2"
   }
  },
  "nbformat": 4,
diff --git a/examples/python/tutorial-4-qfunction.ipynb b/examples/python/tutorial-4-qfunction.ipynb
index d6495e241e..9aba23d2fc 100644
--- a/examples/python/tutorial-4-qfunction.ipynb
+++ b/examples/python/tutorial-4-qfunction.ipynb
@@ -189,7 +189,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -203,7 +203,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.13.2"
   }
  },
  "nbformat": 4,
diff --git a/examples/python/tutorial-5-operator.ipynb b/examples/python/tutorial-5-operator.ipynb
index 123a59836c..bb756a42a7 100644
--- a/examples/python/tutorial-5-operator.ipynb
+++ b/examples/python/tutorial-5-operator.ipynb
@@ -56,41 +56,41 @@
     "\n",
     "ceed = libceed.Ceed()\n",
     "\n",
-    "nelem = 15\n",
+    "num_elem = 15\n",
     "p = 5\n",
     "q = 8\n",
-    "nx = nelem + 1\n",
-    "nu = nelem*(p-1) + 1\n",
+    "num_x = num_elem + 1\n",
+    "num_u = num_elem*(p-1) + 1\n",
     "\n",
     "# Vectors\n",
-    "x = ceed.Vector(nx)\n",
-    "x_array = np.zeros(nx)\n",
-    "for i in range(nx):\n",
-    "  x_array[i] = i / (nx - 1.0)\n",
+    "x = ceed.Vector(num_x)\n",
+    "x_array = np.zeros(num_x)\n",
+    "for i in range(num_x):\n",
+    "  x_array[i] = i / (num_x - 1.0)\n",
     "x.set_array(x_array, cmode=libceed.USE_POINTER)\n",
     "\n",
-    "qdata = ceed.Vector(nelem*q)\n",
-    "u = ceed.Vector(nu)\n",
-    "v = ceed.Vector(nu)\n",
+    "q_data = ceed.Vector(num_elem*q)\n",
+    "u = ceed.Vector(num_u)\n",
+    "v = ceed.Vector(num_u)\n",
     "\n",
     "# Restrictions\n",
-    "indx = np.zeros(nx*2, dtype=\"int32\")\n",
-    "for i in range(nx):\n",
-    "  indx[2*i+0] = i\n",
-    "  indx[2*i+1] = i+1\n",
-    "rx = ceed.ElemRestriction(nelem, 2, 1, 1, nx, indx, cmode=libceed.USE_POINTER)\n",
-    "\n",
-    "indu = np.zeros(nelem*p, dtype=\"int32\")\n",
-    "for i in range(nelem):\n",
+    "indices_x = np.zeros(num_x*2, dtype=\"int32\")\n",
+    "for i in range(num_x):\n",
+    "  indices_x[2*i+0] = i\n",
+    "  indices_x[2*i+1] = i+1\n",
+    "restriction_x = ceed.ElemRestriction(num_elem, 2, 1, 1, num_x, indices_x, cmode=libceed.USE_POINTER)\n",
+    "\n",
+    "indices_u = np.zeros(num_elem*p, dtype=\"int32\")\n",
+    "for i in range(num_elem):\n",
     "  for j in range(p):\n",
-    "    indu[p*i+j] = i*(p-1) + j\n",
-    "ru = ceed.ElemRestriction(nelem, p, 1, 1, nu, indu, cmode=libceed.USE_POINTER)\n",
+    "    indices_u[p*i+j] = i*(p-1) + j\n",
+    "restriction_u = ceed.ElemRestriction(num_elem, p, 1, 1, num_u, indices_u, cmode=libceed.USE_POINTER)\n",
     "strides = np.array([1, q, q], dtype=\"int32\")\n",
-    "rui = ceed.StridedElemRestriction(nelem, q, 1, q*nelem, strides)\n",
+    "restriction_q_data = ceed.StridedElemRestriction(num_elem, q, 1, q*num_elem, strides)\n",
     "\n",
     "# Bases\n",
-    "bx = ceed.BasisTensorH1Lagrange(1, 1, 2, q, libceed.GAUSS)\n",
-    "bu = ceed.BasisTensorH1Lagrange(1, 1, p, q, libceed.GAUSS)\n",
+    "basis_x = ceed.BasisTensorH1Lagrange(1, 1, 2, q, libceed.GAUSS)\n",
+    "basis_u = ceed.BasisTensorH1Lagrange(1, 1, p, q, libceed.GAUSS)\n",
     "\n",
     "# QFunctions\n",
     "qf_setup = ceed.QFunctionByName(\"Mass1DBuild\")\n",
@@ -98,24 +98,24 @@
     "\n",
     "# Setup operator\n",
     "op_setup = ceed.Operator(qf_setup)\n",
-    "op_setup.set_field(\"dx\", rx, bx, libceed.VECTOR_ACTIVE)\n",
-    "op_setup.set_field(\"weights\", libceed.ELEMRESTRICTION_NONE, bx,\n",
+    "op_setup.set_field(\"dx\", restriction_x, basis_x, libceed.VECTOR_ACTIVE)\n",
+    "op_setup.set_field(\"weights\", libceed.ELEMRESTRICTION_NONE, basis_x,\n",
     "                   libceed.VECTOR_NONE)\n",
-    "op_setup.set_field(\"qdata\", rui, libceed.BASIS_NONE,\n",
+    "op_setup.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE,\n",
     "                   libceed.VECTOR_ACTIVE)\n",
     "op_setup.check()\n",
     "print('Setup operator: ', op_setup)\n",
     "\n",
     "# Mass operator\n",
     "op_mass = ceed.Operator(qf_mass)\n",
-    "op_mass.set_field(\"u\", ru, bu, libceed.VECTOR_ACTIVE)\n",
-    "op_mass.set_field(\"qdata\", rui, libceed.BASIS_NONE, qdata)\n",
-    "op_mass.set_field(\"v\", ru, bu, libceed.VECTOR_ACTIVE)\n",
+    "op_mass.set_field(\"u\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n",
+    "op_mass.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE, q_data)\n",
+    "op_mass.set_field(\"v\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n",
     "op_mass.check()\n",
     "print('Mass operator: ', op_mass)\n",
     "\n",
     "# Setup\n",
-    "op_setup.apply(x, qdata)\n",
+    "op_setup.apply(x, q_data)\n",
     "\n",
     "# Apply mass matrix\n",
     "u.set_value(1)\n",
@@ -125,11 +125,103 @@
     "with v.array_read() as v_array:\n",
     "  print('The length of the domain is l = %4.2f'%np.sum(v_array))"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* In the next example, we create and apply a CeedOperator for the Poisson operator in 1D. By applying this operator to a vector with a linear function, we compute the 'surface area' of this 1D domain, similar to Ex2-Surface in the [tutorial-6-shell tutorial](./tutorial-6-shell.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import libceed\n",
+    "import numpy as np\n",
+    "\n",
+    "ceed = libceed.Ceed()\n",
+    "\n",
+    "num_elem = 15\n",
+    "p = 5\n",
+    "q = 8\n",
+    "num_x = num_elem + 1\n",
+    "num_u = num_elem*(p-1) + 1\n",
+    "\n",
+    "# Vectors\n",
+    "x = ceed.Vector(num_x)\n",
+    "x_array = np.zeros(num_x)\n",
+    "for i in range(num_x):\n",
+    "  x_array[i] = i / (num_x - 1.0)\n",
+    "x.set_array(x_array, cmode=libceed.USE_POINTER)\n",
+    "\n",
+    "q_data = ceed.Vector(num_elem*q)\n",
+    "u = ceed.Vector(num_u)\n",
+    "v = ceed.Vector(num_u)\n",
+    "\n",
+    "# Restrictions\n",
+    "indices_x = np.zeros(num_x*2, dtype=\"int32\")\n",
+    "for i in range(num_x):\n",
+    "  indices_x[2*i+0] = i\n",
+    "  indices_x[2*i+1] = i+1\n",
+    "restriction_x = ceed.ElemRestriction(num_elem, 2, 1, 1, num_x, indices_x, cmode=libceed.USE_POINTER)\n",
+    "\n",
+    "indices_u = np.zeros(num_elem*p, dtype=\"int32\")\n",
+    "for i in range(num_elem):\n",
+    "  for j in range(p):\n",
+    "    indices_u[p*i+j] = i*(p-1) + j\n",
+    "restriction_u = ceed.ElemRestriction(num_elem, p, 1, 1, num_u, indices_u, cmode=libceed.USE_POINTER)\n",
+    "strides = np.array([1, q, q], dtype=\"int32\")\n",
+    "restriction_q_data = ceed.StridedElemRestriction(num_elem, q, 1, q*num_elem, strides)\n",
+    "\n",
+    "# Bases\n",
+    "basis_x = ceed.BasisTensorH1Lagrange(1, 1, 2, q, libceed.GAUSS)\n",
+    "basis_u = ceed.BasisTensorH1Lagrange(1, 1, p, q, libceed.GAUSS)\n",
+    "\n",
+    "# QFunctions\n",
+    "qf_setup = ceed.QFunctionByName(\"Poisson1DBuild\")\n",
+    "qf_mass = ceed.QFunctionByName(\"Poisson1DApply\")\n",
+    "\n",
+    "# Setup operator\n",
+    "op_setup = ceed.Operator(qf_setup)\n",
+    "op_setup.set_field(\"dx\", restriction_x, basis_x, libceed.VECTOR_ACTIVE)\n",
+    "op_setup.set_field(\"weights\", libceed.ELEMRESTRICTION_NONE, basis_x,\n",
+    "                   libceed.VECTOR_NONE)\n",
+    "op_setup.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE,\n",
+    "                   libceed.VECTOR_ACTIVE)\n",
+    "op_setup.check()\n",
+    "print('Setup operator: ', op_setup)\n",
+    "\n",
+    "# Poisson operator\n",
+    "op_poisson = ceed.Operator(qf_mass)\n",
+    "op_poisson.set_field(\"du\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n",
+    "op_poisson.set_field(\"qdata\", restriction_q_data, libceed.BASIS_NONE, q_data)\n",
+    "op_poisson.set_field(\"dv\", restriction_u, basis_u, libceed.VECTOR_ACTIVE)\n",
+    "op_poisson.check()\n",
+    "print('Poisson operator: ', op_poisson)\n",
+    "\n",
+    "# Setup\n",
+    "op_setup.apply(x, q_data)\n",
+    "\n",
+    "# Apply Poisson operator\n",
+    "with u.array_write() as u_array:\n",
+    "  [points, _] = ceed.lobatto_quadrature(p)\n",
+    "  for elem in range(num_elem):\n",
+    "      for point in range(p):\n",
+    "          u_array[elem * (p - 1) + point] = (1.0 + 2.0 * elem + points[point])/(2.0 * num_elem)\n",
+    "op_poisson.apply(u, v)\n",
+    "\n",
+    "# Check\n",
+    "with v.array_read() as v_array:\n",
+    "  print('The surface area of the domain is dl = %4.2f'%np.sum(abs(v_array)))"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -143,7 +235,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.13.2"
   }
  },
  "nbformat": 4,
diff --git a/examples/rust-qfunctions/.gitignore b/examples/rust-qfunctions/.gitignore
new file mode 100644
index 0000000000..f2ceaf60f1
--- /dev/null
+++ b/examples/rust-qfunctions/.gitignore
@@ -0,0 +1,2 @@
+ex1-volume
+temp/*
diff --git a/examples/rust-qfunctions/Makefile b/examples/rust-qfunctions/Makefile
new file mode 100644
index 0000000000..2fba76706a
--- /dev/null
+++ b/examples/rust-qfunctions/Makefile
@@ -0,0 +1,35 @@
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
+# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-2-Clause
+#
+# This file is part of CEED:  http://github.com/ceed
+
+OPT ?= -O -g
+
+# Ceed directory
+CEED_DIR ?= ../..
+CEED_FLAGS ?= -I$(CEED_DIR)/include -std=c11  $(OPT)
+CEED_LIBS ?= -Wl,-rpath,$(abspath $(CEED_DIR)/lib) -L$(CEED_DIR)/lib -lceed -L$(CEED_DIR)/examples/ceed -lm
+
+EXAMPLES.c = $(wildcard ex*.c)
+EXAMPLES = $(EXAMPLES.c:%.c=%)
+
+.SUFFIXES:
+.SUFFIXES: .c
+.PHONY: all clean
+
+all: $(EXAMPLES)
+
+# Remove built-in rules
+%: %.c
+
+# Special build rule for example 1 (rust)
+ex1-volume: ex1-volume.c
+	cargo +nightly build --release --manifest-path ex1-volume-rs/Cargo.toml --config ex1-volume-rs/.cargo/config.toml
+	$(LINK.c) $(CEED_FLAGS) $(CEED_LDFLAGS) $(abspath $<) -o $@ $(CEED_LIBS) -L$(CEED_DIR)/examples/rust-qfunctions/ex1-volume-rs/target/release -lex1_volume_rs
+
+clean:
+	rm -f *~ $(EXAMPLES)
+	rm -rf temp/
+	rm -rf *.dSYM *.TVD.*breakpoints
diff --git a/examples/rust-qfunctions/ex1-volume-rs/.cargo/config.toml b/examples/rust-qfunctions/ex1-volume-rs/.cargo/config.toml
new file mode 100644
index 0000000000..ca727ba27d
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume-rs/.cargo/config.toml
@@ -0,0 +1,6 @@
+[target.nvptx64-nvidia-cuda]
+rustflags = [
+  "-C", "linker-plugin-lto",
+]
+[unstable]
+build-std = ["panic_abort","core", "alloc"]
diff --git a/examples/rust-qfunctions/ex1-volume-rs/.gitignore b/examples/rust-qfunctions/ex1-volume-rs/.gitignore
new file mode 100644
index 0000000000..20a838f835
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume-rs/.gitignore
@@ -0,0 +1,3 @@
+target
+registry
+Cargo.lock
diff --git a/examples/rust-qfunctions/ex1-volume-rs/Cargo.toml b/examples/rust-qfunctions/ex1-volume-rs/Cargo.toml
new file mode 100644
index 0000000000..afc2f3b200
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume-rs/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "ex1-volume-rs"
+version = "0.1.0"
+edition = "2021"
+
+[profile.dev]
+panic = "abort"
+
+[profile.release]
+panic = "abort"
+
+# Compiles the crate as a lib (for GPU) and staticlib (for CPU)
+[lib]
+crate-type = ["staticlib"]
+
+[dependencies]
+ndarray = {version = "0.16.1", default-features = false}
diff --git a/examples/rust-qfunctions/ex1-volume-rs/rust-toolchain.toml b/examples/rust-qfunctions/ex1-volume-rs/rust-toolchain.toml
new file mode 100644
index 0000000000..5d56faf9ae
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume-rs/rust-toolchain.toml
@@ -0,0 +1,2 @@
+[toolchain]
+channel = "nightly"
diff --git a/examples/rust-qfunctions/ex1-volume-rs/src/lib.rs b/examples/rust-qfunctions/ex1-volume-rs/src/lib.rs
new file mode 100644
index 0000000000..8f2a36dfc9
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume-rs/src/lib.rs
@@ -0,0 +1,124 @@
+#![no_std]
+#![allow(internal_features)]
+#![feature(asm_experimental_arch, abi_ptx, core_intrinsics)]
+use core::ffi::c_void;
+use core::intrinsics::abort;
+use core::panic::PanicInfo;
+
+use ndarray::ArrayView;
+
+// This is a dummy allocator that always returns null. Heap allocations do not work on GPUs
+use core::alloc::{GlobalAlloc, Layout};
+pub struct Allocator;
+unsafe impl GlobalAlloc for Allocator {
+    unsafe fn alloc(&self, _layout: Layout) -> *mut u8 {
+        0 as *mut u8
+    }
+    unsafe fn dealloc(&self, _ptr: *mut u8, _layout: Layout) {
+        abort(); // since we never allocate
+    }
+}
+#[global_allocator]
+static GLOBAL_ALLOCATOR: Allocator = Allocator;
+
+// This is a copy of the same data structure defined in the .h file. It can be autogenerated using bindgen/cbindgen
+#[doc = " A structure used to pass additional data to f_build_mass"]
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct BuildContext {
+    pub dim: i32,
+    pub space_dim: i32,
+}
+
+// On no_std targets, its required to implement your own panic function.
+#[panic_handler]
+fn panic(_info: &PanicInfo) -> ! {
+    abort()
+}
+
+/* The no_mangle is required because rust "mangles" names (changes them to prevent namespace conflicts)
+Also note that this function ends in _rs, even though the C call `CEED_QFUNCTION_RUST(build_mass)` doesn't */
+#[no_mangle]
+pub unsafe extern "C" fn build_mass_rs(
+    ctx: *mut c_void,
+    q: i32,
+    in_: *const *const f64,
+    out: *mut *mut f64,
+) -> i8 {
+    let ctx: *mut BuildContext = unsafe { core::mem::transmute(ctx) };
+    let ctx: &mut BuildContext = &mut *ctx;
+
+    let in_slice = core::slice::from_raw_parts(in_, 2);
+
+    // in_slice[0] is Jacobians with shape [dim, dim, Q]
+    // in_slice[1] is quadrature weights with shape [1, Q]
+    let j_ptr = in_slice[0];
+    let w_ptr = in_slice[1];
+
+    let j = ArrayView::from_shape_ptr((ctx.dim as usize, ctx.dim as usize, q as usize), j_ptr);
+
+    let w = core::slice::from_raw_parts(w_ptr, q as usize);
+
+    let out_slice = core::slice::from_raw_parts_mut(out, 1);
+    let q_data = core::slice::from_raw_parts_mut(out_slice[0], q as usize);
+
+    match ctx.dim * 10 + ctx.space_dim {
+        11 => {
+            // Quadrature Point Loop
+            for i in 0..q as usize {
+                q_data[i] = j[[0, 0, i]] * w[i];
+            }
+        }
+        22 => {
+            // Quadrature Point Loop
+            for i in 0..q as usize {
+                q_data[i] = (j[[0, 0, i]] * j[[1, 1, i]] - j[[0, 1, i]] * j[[1, 0, i]]) * w[i];
+            }
+        }
+        33 => {
+            // Quadrature Point Loop
+            for i in 0..q as usize {
+                q_data[i] = (j[[0, 0, i]]
+                    * (j[[1, 1, i]] * j[[2, 2, i]] - j[[1, 2, i]] * j[[2, 1, i]])
+                    - j[[0, 1, i]] * (j[[1, 0, i]] * j[[2, 2, i]] - j[[1, 2, i]] * j[[2, 0, i]])
+                    + j[[0, 2, i]] * (j[[1, 0, i]] * j[[2, 1, i]] - j[[1, 1, i]] * j[[2, 0, i]]))
+                    * w[i];
+            }
+        }
+        _ => {
+            abort();
+        }
+    }
+
+    0
+}
+
+/* The no_mangle is required because rust "mangles" names (changes them to prevent namespace conflicts)
+Also note that this function ends in _rs, even though the C call `CEED_QFUNCTION_RUST(apply_mass)` doesn't
+For FFI reasons, it is also required to include all parameters in this exact form, even if you don't use all of them*/
+#[no_mangle]
+pub unsafe extern "C" fn apply_mass_rs(
+    _ctx: *mut c_void,
+    q: i32,
+    in_: *const *const f64,
+    out: *mut *mut f64,
+) -> i8 {
+    let in_slice = core::slice::from_raw_parts(in_, 2);
+
+    let u_ptr = in_slice[0];
+    let q_data_ptr = in_slice[1];
+
+    let u = core::slice::from_raw_parts(u_ptr, q as usize);
+    let q_data = core::slice::from_raw_parts(q_data_ptr, q as usize);
+
+    let out_slice = core::slice::from_raw_parts_mut(out, 1);
+
+    let v_ptr = out_slice[0];
+    let v = core::slice::from_raw_parts_mut(v_ptr, q as usize);
+
+    for i in 0..q as usize {
+        v[i] = q_data[i] * u[i];
+    }
+
+    0
+}
diff --git a/examples/rust-qfunctions/ex1-volume.c b/examples/rust-qfunctions/ex1-volume.c
new file mode 100644
index 0000000000..906ab1aff7
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume.c
@@ -0,0 +1,439 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+//                             libCEED Example 1
+//
+// This example illustrates a simple usage of libCEED to compute the volume of a 3D body using matrix-free application of a mass operator.
+// Arbitrary mesh and solution degrees in 1D, 2D and 3D are supported from the same code.
+//
+// The example has no dependencies, and is designed to be self-contained.
+// For additional examples that use external discretization libraries (MFEM, PETSc, etc.) see the subdirectories in libceed/examples.
+//
+// All libCEED objects use a Ceed device object constructed based on a command line argument (-ceed).
+//
+// Build with:
+//
+//     make ex1-volume-rust [CEED_DIR=</path/to/libceed>]
+//
+// Sample runs:
+//
+//     ./ex1-volume
+//     ./ex1-volume -ceed /cpu/self
+//     ./ex1-volume -ceed /gpu/cuda
+//
+// Test in 1D-3D
+//TESTARGS(name="1D User QFunction") -ceed {ceed_resource} -d 1 -t
+//TESTARGS(name="2D User QFunction") -ceed {ceed_resource} -d 2 -t
+//TESTARGS(name="3D User QFunction") -ceed {ceed_resource} -d 3 -t
+//TESTARGS(name="1D Gallery QFunction") -ceed {ceed_resource} -d 1 -t -g
+//TESTARGS(name="2D Gallery QFunction") -ceed {ceed_resource} -d 2 -t -g
+//TESTARGS(name="3D Gallery QFunction") -ceed {ceed_resource} -d 3 -t -g
+
+/// @file
+/// libCEED example using mass operator to compute volume
+
+#include "ex1-volume.h"
+
+#include <ceed.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Auxiliary functions
+int        GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]);
+int        BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts,
+                                     CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction);
+int        SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords);
+CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords);
+
+// Main example
+int main(int argc, const char *argv[]) {
+  const char *ceed_spec   = "/cpu/self";
+  CeedInt     dim         = 3;               // dimension of the mesh
+  CeedInt     num_comp_x  = 3;               // number of x components
+  CeedInt     mesh_degree = 4;               // polynomial degree for the mesh
+  CeedInt     sol_degree  = 4;               // polynomial degree for the solution
+  CeedInt     num_qpts    = sol_degree + 2;  // number of 1D quadrature points
+  CeedInt     prob_size   = -1;              // approximate problem size
+  CeedInt     help = 0, test = 0, gallery = 0, benchmark = 0;
+
+  // Process command line arguments.
+  for (int ia = 1; ia < argc; ia++) {
+    // LCOV_EXCL_START
+    int next_arg = ((ia + 1) < argc), parse_error = 0;
+    if (!strcmp(argv[ia], "-h")) {
+      help = 1;
+    } else if (!strcmp(argv[ia], "-c") || !strcmp(argv[ia], "-ceed")) {
+      parse_error = next_arg ? ceed_spec = argv[++ia], 0 : 1;
+    } else if (!strcmp(argv[ia], "-d")) {
+      parse_error = next_arg ? dim = atoi(argv[++ia]), 0 : 1;
+      num_comp_x                   = dim;
+    } else if (!strcmp(argv[ia], "-m")) {
+      parse_error = next_arg ? mesh_degree = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-p")) {
+      parse_error = next_arg ? sol_degree = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-q")) {
+      parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-s")) {
+      parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-b")) {
+      parse_error = next_arg ? benchmark = atoi(argv[++ia]), 0 : 1;
+    } else if (!strcmp(argv[ia], "-t")) {
+      test = 1;
+    } else if (!strcmp(argv[ia], "-g")) {
+      gallery = 1;
+    }
+    if (parse_error) {
+      printf("Error parsing command line options.\n");
+      return 1;
+    }
+    // LCOV_EXCL_STOP
+  }
+  if (prob_size < 0) prob_size = test ? 8 * 16 : 256 * 1024;
+
+  // Print the values of all options:
+  if (!test || help) {
+    // LCOV_EXCL_START
+    printf("Selected options: [command line option] : <current value>\n");
+    printf("  Ceed specification     [-c] : %s\n", ceed_spec);
+    printf("  Mesh dimension         [-d] : %" CeedInt_FMT "\n", dim);
+    printf("  Mesh degree            [-m] : %" CeedInt_FMT "\n", mesh_degree);
+    printf("  Solution degree        [-p] : %" CeedInt_FMT "\n", sol_degree);
+    printf("  Num. 1D quadrature pts [-q] : %" CeedInt_FMT "\n", num_qpts);
+    printf("  Approx. # unknowns     [-s] : %" CeedInt_FMT "\n", prob_size);
+    printf("  QFunction source       [-g] : %s\n", gallery ? "gallery" : "header");
+    if (help) {
+      printf("Test/quiet mode is %s\n", (test ? "ON" : "OFF (use -t to enable)"));
+      return 0;
+    }
+    printf("\n");
+    // LCOV_EXCL_STOP
+  }
+
+  // Select appropriate backend and logical device based on the (-ceed) command line argument.
+  Ceed ceed;
+
+  CeedInit(ceed_spec, &ceed);
+
+  // Add the path to the Rust crate to the ceed object.
+  {
+    char  root[2048] = __FILE__;
+    char *last_slash = strrchr(root, '/');
+
+    strncpy(last_slash + 1, "ex1-volume-rs", 14);
+    CeedAddRustSourceRoot(ceed, root);
+  }
+
+  // Construct the mesh and solution bases.
+  CeedBasis mesh_basis, sol_basis;
+
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis);
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis);
+
+  // Determine the mesh size based on the given approximate problem size.
+  CeedInt num_xyz[dim];
+
+  GetCartesianMeshSize(dim, sol_degree, prob_size, num_xyz);
+  if (!test) {
+    // LCOV_EXCL_START
+    printf("Mesh size: nx = %" CeedInt_FMT, num_xyz[0]);
+    if (dim > 1) printf(", ny = %" CeedInt_FMT, num_xyz[1]);
+    if (dim > 2) printf(", nz = %" CeedInt_FMT, num_xyz[2]);
+    printf("\n");
+    // LCOV_EXCL_STOP
+  }
+
+  // Build CeedElemRestriction objects describing the mesh and solution discrete representations.
+  CeedInt             mesh_size, sol_size;
+  CeedElemRestriction mesh_restriction, sol_restriction, q_data_restriction;
+
+  BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restriction, NULL);
+  BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restriction, &q_data_restriction);
+  if (!test) {
+    // LCOV_EXCL_START
+    printf("Number of mesh nodes     : %" CeedInt_FMT "\n", mesh_size / dim);
+    printf("Number of solution nodes : %" CeedInt_FMT "\n", sol_size);
+    // LCOV_EXCL_STOP
+  }
+
+  // Create a CeedVector with the mesh coordinates.
+  CeedVector mesh_coords;
+
+  CeedVectorCreate(ceed, mesh_size, &mesh_coords);
+  SetCartesianMeshCoords(dim, num_xyz, mesh_degree, mesh_coords);
+
+  // Apply a transformation to the mesh.
+  CeedScalar exact_volume = TransformMeshCoords(dim, mesh_size, mesh_coords);
+
+  // Context data to be passed to the 'build_mass' QFunction.
+  CeedQFunctionContext build_ctx;
+  struct BuildContext  build_ctx_data;
+
+  build_ctx_data.dim = build_ctx_data.space_dim = dim;
+  CeedQFunctionContextCreate(ceed, &build_ctx);
+  CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data);
+
+  // Create the QFunction that builds the mass operator (i.e. computes its quadrature data) and set its context data.
+  CeedQFunction qf_build;
+
+  if (gallery) {
+    // This creates the QFunction via the gallery.
+    char name[13] = "";
+    snprintf(name, sizeof name, "Mass%" CeedInt_FMT "DBuild", dim);
+    CeedQFunctionCreateInteriorByName(ceed, name, &qf_build);
+  } else {
+    // This creates the QFunction directly.
+    CeedQFunctionCreateInterior(ceed, 1, build_mass, build_mass_loc, &qf_build);
+    CeedQFunctionAddInput(qf_build, "dx", num_comp_x * dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddOutput(qf_build, "qdata", 1, CEED_EVAL_NONE);
+    CeedQFunctionSetContext(qf_build, build_ctx);
+  }
+
+  // Create the operator that builds the quadrature data for the mass operator.
+  CeedOperator op_build;
+
+  CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build);
+  CeedOperatorSetField(op_build, "dx", mesh_restriction, mesh_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE);
+  CeedOperatorSetField(op_build, "qdata", q_data_restriction, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+
+  // Compute the quadrature data for the mass operator.
+  CeedVector q_data;
+  CeedInt    elem_qpts = CeedIntPow(num_qpts, dim);
+  CeedInt    num_elem  = 1;
+
+  for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d];
+  CeedVectorCreate(ceed, num_elem * elem_qpts, &q_data);
+  CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE);
+
+  // Create the QFunction that defines the action of the mass operator.
+  CeedQFunction qf_apply;
+
+  if (gallery) {
+    // This creates the QFunction via the gallery.
+    CeedQFunctionCreateInteriorByName(ceed, "MassApply", &qf_apply);
+  } else {
+    // This creates the QFunction directly.
+    CeedQFunctionCreateInterior(ceed, 1, apply_mass, apply_mass_loc, &qf_apply);
+    CeedQFunctionAddInput(qf_apply, "u", 1, CEED_EVAL_INTERP);
+    CeedQFunctionAddInput(qf_apply, "qdata", 1, CEED_EVAL_NONE);
+    CeedQFunctionAddOutput(qf_apply, "v", 1, CEED_EVAL_INTERP);
+  }
+
+  // Create the mass operator.
+  CeedOperator op_apply;
+
+  CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply);
+  CeedOperatorSetField(op_apply, "u", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_apply, "qdata", q_data_restriction, CEED_BASIS_NONE, q_data);
+  CeedOperatorSetField(op_apply, "v", sol_restriction, sol_basis, CEED_VECTOR_ACTIVE);
+
+  // Create auxiliary solution-size vectors.
+  CeedVector u, v;
+
+  CeedVectorCreate(ceed, sol_size, &u);
+  CeedVectorCreate(ceed, sol_size, &v);
+
+  // Initialize 'u' with ones.
+  CeedVectorSetValue(u, 1.0);
+
+  // Compute the mesh volume using the mass operator: volume = 1^T \cdot M \cdot 1
+  CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+
+  // Benchmark runs
+  if (!test && benchmark) {
+    // LCOV_EXCL_START
+    printf(" Executing %d benchmarking runs...\n", benchmark);
+    // LCOV_EXCL_STOP
+  }
+  for (CeedInt i = 0; i < benchmark; i++) {
+    // LCOV_EXCL_START
+    CeedOperatorApply(op_apply, u, v, CEED_REQUEST_IMMEDIATE);
+    // LCOV_EXCL_STOP
+  }
+
+  // Compute and print the sum of the entries of 'v' giving the mesh volume.
+  CeedScalar volume = 0.;
+
+  {
+    const CeedScalar *v_array;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < sol_size; i++) volume += v_array[i];
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+  if (!test) {
+    // LCOV_EXCL_START
+    printf(" done.\n");
+    printf("Exact mesh volume    : % .14g\n", exact_volume);
+    printf("Computed mesh volume : % .14g\n", volume);
+    printf("Volume error         : % .14g\n", volume - exact_volume);
+    // LCOV_EXCL_STOP
+  } else {
+    CeedScalar tol = (dim == 1 ? 200. * CEED_EPSILON : dim == 2 ? 1E-5 : 1E-5);
+
+    if (fabs(volume - exact_volume) > tol) printf("Volume error : % .1e\n", volume - exact_volume);
+  }
+
+  // Free dynamically allocated memory.
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&q_data);
+  CeedVectorDestroy(&mesh_coords);
+  CeedOperatorDestroy(&op_apply);
+  CeedQFunctionDestroy(&qf_apply);
+  CeedQFunctionContextDestroy(&build_ctx);
+  CeedOperatorDestroy(&op_build);
+  CeedQFunctionDestroy(&qf_build);
+  CeedElemRestrictionDestroy(&sol_restriction);
+  CeedElemRestrictionDestroy(&mesh_restriction);
+  CeedElemRestrictionDestroy(&q_data_restriction);
+  CeedBasisDestroy(&sol_basis);
+  CeedBasisDestroy(&mesh_basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
+
+int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]) {
+  // Use the approximate formula:
+  //    prob_size ~ num_elem * degree^dim
+  CeedInt num_elem = prob_size / CeedIntPow(degree, dim);
+  CeedInt s        = 0;  // find s: num_elem/2 < 2^s <= num_elem
+
+  while (num_elem > 1) {
+    num_elem /= 2;
+    s++;
+  }
+  CeedInt r = s % dim;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    CeedInt sd = s / dim;
+
+    if (r > 0) {
+      sd++;
+      r--;
+    }
+    num_xyz[d] = 1 << sd;
+  }
+  return 0;
+}
+
+int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts,
+                              CeedElemRestriction *restriction, CeedElemRestriction *q_data_restriction) {
+  CeedInt p         = degree + 1;
+  CeedInt num_nodes = CeedIntPow(p, dim);         // number of scalar nodes per element
+  CeedInt elem_qpts = CeedIntPow(num_qpts, dim);  // number of qpts per element
+  CeedInt nd[3], num_elem = 1, scalar_size = 1;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    num_elem *= num_xyz[d];
+    nd[d] = num_xyz[d] * (p - 1) + 1;
+    scalar_size *= nd[d];
+  }
+  *size = scalar_size * num_comp;
+  // elem:         0             1                 n-1
+  //           |---*-...-*---|---*-...-*---|- ... -|--...--|
+  // num_nodes:   0   1    p-1  p  p+1       2*p             n*p
+  CeedInt *elem_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes);
+
+  for (CeedInt e = 0; e < num_elem; e++) {
+    CeedInt e_xyz[3] = {1, 1, 1}, re = e;
+
+    for (CeedInt d = 0; d < dim; d++) {
+      e_xyz[d] = re % num_xyz[d];
+      re /= num_xyz[d];
+    }
+    CeedInt *local_elem_nodes = elem_nodes + e * num_nodes;
+
+    for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) {
+      CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes;
+
+      for (CeedInt d = 0; d < dim; d++) {
+        g_nodes += (e_xyz[d] * (p - 1) + r_nodes % p) * g_nodes_stride;
+        g_nodes_stride *= nd[d];
+        r_nodes /= p;
+      }
+      local_elem_nodes[l_nodes] = g_nodes;
+    }
+  }
+  CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, elem_nodes,
+                            restriction);
+  if (q_data_restriction) {
+    CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, q_data_restriction);
+  }
+  free(elem_nodes);
+  return 0;
+}
+
+int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords) {
+  CeedInt p = mesh_degree + 1;
+  CeedInt nd[3], scalar_size = 1;
+
+  for (CeedInt d = 0; d < dim; d++) {
+    nd[d] = num_xyz[d] * (p - 1) + 1;
+    scalar_size *= nd[d];
+  }
+  CeedScalar *coords;
+
+  CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords);
+  CeedScalar *nodes = malloc(sizeof(CeedScalar) * p);
+
+  // The H1 basis uses Lobatto quadrature points as nodes.
+  CeedLobattoQuadrature(p, nodes, NULL);  // nodes are in [-1,1]
+  for (CeedInt i = 0; i < p; i++) nodes[i] = 0.5 + 0.5 * nodes[i];
+  for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) {
+    CeedInt r_nodes = gs_nodes;
+
+    for (CeedInt d = 0; d < dim; d++) {
+      CeedInt d_1d = r_nodes % nd[d];
+
+      coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) + nodes[d_1d % (p - 1)]) / num_xyz[d];
+      r_nodes /= nd[d];
+    }
+  }
+  free(nodes);
+  CeedVectorRestoreArray(mesh_coords, &coords);
+  return 0;
+}
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#define M_PI_2 1.57079632679489661923
+#endif
+
+CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords) {
+  CeedScalar  exact_volume;
+  CeedScalar *coords;
+
+  CeedVectorGetArray(mesh_coords, CEED_MEM_HOST, &coords);
+  if (dim == 1) {
+    for (CeedInt i = 0; i < mesh_size; i++) {
+      // map [0,1] to [0,1] varying the mesh density
+      coords[i] = 0.5 + 1. / sqrt(3.) * sin((2. / 3.) * M_PI * (coords[i] - 0.5));
+    }
+    exact_volume = 1.;
+  } else {
+    CeedInt num_nodes = mesh_size / dim;
+
+    for (CeedInt i = 0; i < num_nodes; i++) {
+      // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
+      // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
+      CeedScalar u = coords[i], v = coords[i + num_nodes];
+
+      u                     = 1. + u;
+      v                     = M_PI_2 * v;
+      coords[i]             = u * cos(v);
+      coords[i + num_nodes] = u * sin(v);
+    }
+    exact_volume = 3. / 4. * M_PI;
+  }
+  CeedVectorRestoreArray(mesh_coords, &coords);
+  return exact_volume;
+}
diff --git a/examples/rust-qfunctions/ex1-volume.h b/examples/rust-qfunctions/ex1-volume.h
new file mode 100644
index 0000000000..1c2baa8fc2
--- /dev/null
+++ b/examples/rust-qfunctions/ex1-volume.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+/// A structure used to pass additional data to f_build_mass
+struct BuildContext {
+  CeedInt dim, space_dim;
+};
+
+// References the rust file for the qfunction named build_mass_rs
+CEED_QFUNCTION_RUST(build_mass)
+
+// References the rust file for the qfunction named apply_mass_rs
+CEED_QFUNCTION_RUST(apply_mass)
diff --git a/examples/rust/ex3-vector-volume/.gitignore b/examples/rust/ex1-volume-vector/.gitignore
similarity index 100%
rename from examples/rust/ex3-vector-volume/.gitignore
rename to examples/rust/ex1-volume-vector/.gitignore
diff --git a/examples/rust/ex3-vector-volume/Cargo.toml b/examples/rust/ex1-volume-vector/Cargo.toml
similarity index 91%
rename from examples/rust/ex3-vector-volume/Cargo.toml
rename to examples/rust/ex1-volume-vector/Cargo.toml
index 3bee448ac7..d3f5b74832 100644
--- a/examples/rust/ex3-vector-volume/Cargo.toml
+++ b/examples/rust/ex1-volume-vector/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "ex3-vector-volume"
+name = "ex1-volume-vector"
 version = "0.11.0"
 authors = [
     "Jeremy L Thompson <thompson.jeremy.luke@gmail.com>",
diff --git a/examples/rust/ex3-vector-volume/src/main.rs b/examples/rust/ex1-volume-vector/src/main.rs
similarity index 87%
rename from examples/rust/ex3-vector-volume/src/main.rs
rename to examples/rust/ex1-volume-vector/src/main.rs
index 9b3cced2b8..85921e688d 100644
--- a/examples/rust/ex3-vector-volume/src/main.rs
+++ b/examples/rust/ex1-volume-vector/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -20,20 +20,23 @@
 // line argument (-ceed).
 
 use clap::Parser;
-use libceed::{prelude::*, Ceed};
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
 mod opt;
 mod transform;
 
 // ----------------------------------------------------------------------------
 // Example 3
 // ----------------------------------------------------------------------------
-#[cfg(not(tarpaulin_include))]
 fn main() -> libceed::Result<()> {
     let options = opt::Opt::parse();
-    example_3(options)
+    example_1_vector(options)
 }
 
-fn example_3(options: opt::Opt) -> libceed::Result<()> {
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
+fn example_1_vector(options: opt::Opt) -> libceed::Result<()> {
     // Process command line arguments
     let opt::Opt {
         ceed_spec,
@@ -46,17 +49,20 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
         quiet,
         gallery,
     } = options;
-    assert!(dim >= 1 && dim <= 3);
+    assert!((0..=3).contains(&dim));
     assert!(mesh_degree >= 1);
     assert!(solution_degree >= 1);
     assert!(num_qpts >= 1);
     let ncomp_x = dim;
-    let problem_size: i64;
-    if problem_size_requested < 0 {
-        problem_size = if test { 8 * 16 } else { 256 * 1024 };
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
+            8 * 16
+        } else {
+            256 * 1024
+        }
     } else {
-        problem_size = problem_size_requested;
-    }
+        problem_size_requested
+    };
     let ncomp_u = 3;
 
     // Summary output
@@ -78,14 +84,19 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
     let ceed = Ceed::init(&ceed_spec);
 
     // Mesh and solution bases
-    let basis_mesh =
-        ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?;
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
     let basis_solution = ceed.basis_tensor_H1_Lagrange(
         dim,
         ncomp_u,
         solution_degree + 1,
         num_qpts,
-        QuadMode::Gauss,
+        libceed::QuadMode::Gauss,
     )?;
 
     // Determine mesh size from approximate problem size
@@ -98,7 +109,7 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
         if dim > 2 {
             print!(", nz = {}", num_xyz[2]);
         }
-        print!("\n");
+        println!();
     }
 
     // Build ElemRestriction objects describing the mesh and solution discrete
@@ -167,9 +178,9 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_build_closure = ceed
         .q_function_interior(1, Box::new(build_mass))?
-        .input("dx", ncomp_x * dim, EvalMode::Grad)?
-        .input("weights", 1, EvalMode::Weight)?
-        .output("qdata", 1, EvalMode::None)?;
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", 1, libceed::EvalMode::None)?;
     // -- QFunction from gallery
     let qf_build_named = {
         let name = format!("Mass{}DBuild", dim);
@@ -218,9 +229,9 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_mass_closure = ceed
         .q_function_interior(1, Box::new(apply_mass))?
-        .input("u", ncomp_u, EvalMode::Interp)?
-        .input("qdata", 1, EvalMode::None)?
-        .output("v", ncomp_u, EvalMode::Interp)?;
+        .input("u", ncomp_u, libceed::EvalMode::Interp)?
+        .input("qdata", 1, libceed::EvalMode::None)?
+        .output("v", ncomp_u, libceed::EvalMode::Interp)?;
     // -- QFunction from gallery
     let qf_mass_named = ceed.q_function_interior_by_name("Vector3MassApply")?;
     // -- QFunction for use with Operator
@@ -256,7 +267,7 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
     op_mass.apply(&u, &mut v)?;
 
     // Compute the mesh volume
-    let volume: Scalar = v.view()?.iter().sum::<libceed::Scalar>()
+    let volume: libceed::Scalar = v.view()?.iter().sum::<libceed::Scalar>()
         / ((ncomp_u * (ncomp_u + 1)) / 2) as libceed::Scalar;
 
     // Output results
@@ -269,7 +280,7 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> {
         );
     }
     let tolerance = match dim {
-        1 => 100.0 * libceed::EPSILON,
+        1 => 200.0 * libceed::EPSILON,
         _ => 1E-5,
     };
     let error = (volume - exact_volume).abs();
@@ -293,7 +304,7 @@ mod tests {
     use super::*;
 
     #[test]
-    fn example_3_1d() {
+    fn example_1_vector_1d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 1,
@@ -305,11 +316,11 @@ mod tests {
             quiet: true,
             gallery: false,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 
     #[test]
-    fn example_3_2d() {
+    fn example_1_vector_2d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 2,
@@ -321,11 +332,11 @@ mod tests {
             quiet: true,
             gallery: false,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 
     #[test]
-    fn example_3_3d() {
+    fn example_1_vector_3d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 3,
@@ -337,11 +348,11 @@ mod tests {
             quiet: false,
             gallery: false,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 
     #[test]
-    fn example_3_1d_gallery() {
+    fn example_1_vector_1d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 1,
@@ -353,11 +364,11 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 
     #[test]
-    fn example_3_2d_gallery() {
+    fn example_1_vector_2d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 2,
@@ -369,11 +380,11 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 
     #[test]
-    fn example_3_3d_gallery() {
+    fn example_1_vector_3d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 3,
@@ -385,7 +396,7 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_3(options).is_ok());
+        assert!(example_1_vector(options).is_ok());
     }
 }
 
diff --git a/examples/rust/ex3-vector-volume/src/opt.rs b/examples/rust/ex1-volume-vector/src/opt.rs
similarity index 95%
rename from examples/rust/ex3-vector-volume/src/opt.rs
rename to examples/rust/ex1-volume-vector/src/opt.rs
index 5de7c68f08..37cbf0a3c0 100644
--- a/examples/rust/ex3-vector-volume/src/opt.rs
+++ b/examples/rust/ex1-volume-vector/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -15,7 +15,6 @@ use clap::Parser;
     name = "libCEED Rust Example 3 - Vector Volume",
     about = "This example uses the mass matrix to compute the length, area, or volume of a region in triplicate, depending upon runtime parameters."
 )]
-#[cfg(not(tarpaulin_include))]
 pub(crate) struct Opt {
     /// libCEED backend resource to use
     #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]
diff --git a/examples/rust/ex3-vector-volume/src/transform.rs b/examples/rust/ex1-volume-vector/src/transform.rs
similarity index 72%
rename from examples/rust/ex3-vector-volume/src/transform.rs
rename to examples/rust/ex1-volume-vector/src/transform.rs
index 6ebe14bc6f..7073937353 100644
--- a/examples/rust/ex3-vector-volume/src/transform.rs
+++ b/examples/rust/ex1-volume-vector/src/transform.rs
@@ -1,27 +1,25 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-use libceed::prelude::*;
-
 // ----------------------------------------------------------------------------
 // Transform mesh coordinates
 // ----------------------------------------------------------------------------
 pub(crate) fn transform_mesh_coordinates(
     dim: usize,
     mesh_size: usize,
-    mesh_coords: &mut Vector,
-) -> libceed::Result<Scalar> {
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
     // Transform coordinates
     if dim == 1 {
         for coord in mesh_coords.view_mut()?.iter_mut() {
             // map [0,1] to [0,1] varying the mesh density
             *coord = 0.5
-                + 1.0 / (3.0 as Scalar).sqrt()
-                    * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin()
+                + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                    * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin()
         }
     } else {
         let mut coords = mesh_coords.view_mut()?;
@@ -30,7 +28,7 @@ pub(crate) fn transform_mesh_coordinates(
             // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
             // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
             let u = 1.0 + coords[i];
-            let v = std::f64::consts::PI as Scalar / 2.0 * coords[i + num_nodes];
+            let v = std::f64::consts::PI as libceed::Scalar / 2.0 * coords[i + num_nodes];
             coords[i] = u * v.cos();
             coords[i + num_nodes] = u * v.sin();
         }
@@ -39,7 +37,8 @@ pub(crate) fn transform_mesh_coordinates(
     // Exact volume of transformed region
     let exact_volume = match dim {
         1 => 1.0,
-        _ => 3.0 / 4.0 * std::f64::consts::PI as Scalar,
+        2 | 3 => 3.0 / 4.0 * std::f64::consts::PI as libceed::Scalar,
+        _ => unreachable!(),
     };
     Ok(exact_volume)
 }
diff --git a/examples/rust/ex1-volume/src/main.rs b/examples/rust/ex1-volume/src/main.rs
index bea2e2f79c..9020fb270c 100644
--- a/examples/rust/ex1-volume/src/main.rs
+++ b/examples/rust/ex1-volume/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -19,19 +19,22 @@
 // line argument (-ceed).
 
 use clap::Parser;
-use libceed::{prelude::*, Ceed};
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
 mod opt;
 mod transform;
 
 // ----------------------------------------------------------------------------
 // Example 1
 // ----------------------------------------------------------------------------
-#[cfg(not(tarpaulin_include))]
 fn main() -> libceed::Result<()> {
     let options = opt::Opt::parse();
     example_1(options)
 }
 
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
 fn example_1(options: opt::Opt) -> libceed::Result<()> {
     // Process command line arguments
     let opt::Opt {
@@ -45,17 +48,20 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
         quiet,
         gallery,
     } = options;
-    assert!(dim >= 1 && dim <= 3);
+    assert!((1..=3).contains(&dim));
     assert!(mesh_degree >= 1);
     assert!(solution_degree >= 1);
     assert!(num_qpts >= 1);
     let ncomp_x = dim;
-    let problem_size: i64;
-    if problem_size_requested < 0 {
-        problem_size = if test { 8 * 16 } else { 256 * 1024 };
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
+            8 * 16
+        } else {
+            256 * 1024
+        }
     } else {
-        problem_size = problem_size_requested;
-    }
+        problem_size_requested
+    };
 
     // Summary output
     if !quiet {
@@ -76,10 +82,20 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
     let ceed = Ceed::init(&ceed_spec);
 
     // Mesh and solution bases
-    let basis_mesh =
-        ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?;
-    let basis_solution =
-        ceed.basis_tensor_H1_Lagrange(dim, 1, solution_degree + 1, num_qpts, QuadMode::Gauss)?;
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+    let basis_solution = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        1,
+        solution_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
 
     // Determine mesh size from approximate problem size
     let num_xyz = mesh::cartesian_mesh_size(dim, solution_degree, problem_size);
@@ -91,7 +107,7 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
         if dim > 2 {
             print!(", nz = {}", num_xyz[2]);
         }
-        print!("\n");
+        println!();
     }
 
     // Build ElemRestriction objects describing the mesh and solution discrete
@@ -158,9 +174,9 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_build_closure = ceed
         .q_function_interior(1, Box::new(build_mass))?
-        .input("dx", ncomp_x * dim, EvalMode::Grad)?
-        .input("weights", 1, EvalMode::Weight)?
-        .output("qdata", 1, EvalMode::None)?;
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", 1, libceed::EvalMode::None)?;
     // -- QFunction from gallery
     let qf_build_named = {
         let name = format!("Mass{}DBuild", dim);
@@ -205,9 +221,9 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_mass_closure = ceed
         .q_function_interior(1, Box::new(apply_mass))?
-        .input("u", 1, EvalMode::Interp)?
-        .input("qdata", 1, EvalMode::None)?
-        .output("v", 1, EvalMode::Interp)?;
+        .input("u", 1, libceed::EvalMode::Interp)?
+        .input("qdata", 1, libceed::EvalMode::None)?
+        .output("v", 1, libceed::EvalMode::Interp)?;
     // -- QFunction from gallery
     let qf_mass_named = ceed.q_function_interior_by_name("MassApply")?;
     // -- QFunction for use with Operator
@@ -234,7 +250,7 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> {
     op_mass.apply(&u, &mut v)?;
 
     // Compute the mesh volume
-    let volume: Scalar = v.view()?.iter().sum();
+    let volume: libceed::Scalar = v.view()?.iter().sum();
 
     // Output results
     if !quiet {
diff --git a/examples/rust/ex1-volume/src/opt.rs b/examples/rust/ex1-volume/src/opt.rs
index 8fd8b71a0d..c93cd17180 100644
--- a/examples/rust/ex1-volume/src/opt.rs
+++ b/examples/rust/ex1-volume/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -15,7 +15,6 @@ use clap::Parser;
     name = "libCEED Rust Example 1 - Volume",
     about = "This example uses the mass matrix to compute the length, area, or volume of a region, depending upon runtime parameters."
 )]
-#[cfg(not(tarpaulin_include))]
 pub(crate) struct Opt {
     /// libCEED backend resource to use
     #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]
diff --git a/examples/rust/ex1-volume/src/transform.rs b/examples/rust/ex1-volume/src/transform.rs
index 6ebe14bc6f..7073937353 100644
--- a/examples/rust/ex1-volume/src/transform.rs
+++ b/examples/rust/ex1-volume/src/transform.rs
@@ -1,27 +1,25 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-use libceed::prelude::*;
-
 // ----------------------------------------------------------------------------
 // Transform mesh coordinates
 // ----------------------------------------------------------------------------
 pub(crate) fn transform_mesh_coordinates(
     dim: usize,
     mesh_size: usize,
-    mesh_coords: &mut Vector,
-) -> libceed::Result<Scalar> {
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
     // Transform coordinates
     if dim == 1 {
         for coord in mesh_coords.view_mut()?.iter_mut() {
             // map [0,1] to [0,1] varying the mesh density
             *coord = 0.5
-                + 1.0 / (3.0 as Scalar).sqrt()
-                    * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin()
+                + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                    * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin()
         }
     } else {
         let mut coords = mesh_coords.view_mut()?;
@@ -30,7 +28,7 @@ pub(crate) fn transform_mesh_coordinates(
             // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
             // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
             let u = 1.0 + coords[i];
-            let v = std::f64::consts::PI as Scalar / 2.0 * coords[i + num_nodes];
+            let v = std::f64::consts::PI as libceed::Scalar / 2.0 * coords[i + num_nodes];
             coords[i] = u * v.cos();
             coords[i + num_nodes] = u * v.sin();
         }
@@ -39,7 +37,8 @@ pub(crate) fn transform_mesh_coordinates(
     // Exact volume of transformed region
     let exact_volume = match dim {
         1 => 1.0,
-        _ => 3.0 / 4.0 * std::f64::consts::PI as Scalar,
+        2 | 3 => 3.0 / 4.0 * std::f64::consts::PI as libceed::Scalar,
+        _ => unreachable!(),
     };
     Ok(exact_volume)
 }
diff --git a/examples/rust/ex4-vector-surface/.gitignore b/examples/rust/ex2-surface-vector/.gitignore
similarity index 100%
rename from examples/rust/ex4-vector-surface/.gitignore
rename to examples/rust/ex2-surface-vector/.gitignore
diff --git a/examples/rust/ex4-vector-surface/Cargo.toml b/examples/rust/ex2-surface-vector/Cargo.toml
similarity index 91%
rename from examples/rust/ex4-vector-surface/Cargo.toml
rename to examples/rust/ex2-surface-vector/Cargo.toml
index 6b41826088..4eac55c52e 100644
--- a/examples/rust/ex4-vector-surface/Cargo.toml
+++ b/examples/rust/ex2-surface-vector/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "ex4-vector-surface"
+name = "ex2-surface-vector"
 version = "0.11.0"
 authors = [
     "Jeremy L Thompson <thompson.jeremy.luke@gmail.com>",
diff --git a/examples/rust/ex4-vector-surface/src/main.rs b/examples/rust/ex2-surface-vector/src/main.rs
similarity index 87%
rename from examples/rust/ex4-vector-surface/src/main.rs
rename to examples/rust/ex2-surface-vector/src/main.rs
index 5847d8033f..e2ff598d2e 100644
--- a/examples/rust/ex4-vector-surface/src/main.rs
+++ b/examples/rust/ex2-surface-vector/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -21,20 +21,23 @@
 // line argument (-ceed).
 
 use clap::Parser;
-use libceed::{prelude::*, Ceed};
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
 mod opt;
 mod transform;
 
 // ----------------------------------------------------------------------------
 // Example 4
 // ----------------------------------------------------------------------------
-#[cfg(not(tarpaulin_include))]
 fn main() -> libceed::Result<()> {
     let options = opt::Opt::parse();
-    example_4(options)
+    example_2_vector(options)
 }
 
-fn example_4(options: opt::Opt) -> libceed::Result<()> {
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
+fn example_2_vector(options: opt::Opt) -> libceed::Result<()> {
     // Process command line arguments
     let opt::Opt {
         ceed_spec,
@@ -47,21 +50,20 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
         quiet,
         gallery,
     } = options;
-    assert!(dim >= 1 && dim <= 3);
+    assert!((0..=3).contains(&dim));
     assert!(mesh_degree >= 1);
     assert!(solution_degree >= 1);
     assert!(num_qpts >= 1);
     let ncomp_x = dim;
-    let problem_size: i64;
-    if problem_size_requested < 0 {
-        problem_size = if test {
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
             16 * 16 * (dim * dim) as i64
         } else {
             256 * 1024
-        };
+        }
     } else {
-        problem_size = problem_size_requested;
-    }
+        problem_size_requested
+    };
     let ncomp_u = 3;
 
     // Summary output
@@ -83,14 +85,19 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
     let ceed = Ceed::init(&ceed_spec);
 
     // Mesh and solution bases
-    let basis_mesh =
-        ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?;
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
     let basis_solution = ceed.basis_tensor_H1_Lagrange(
         dim,
         ncomp_u,
         solution_degree + 1,
         num_qpts,
-        QuadMode::Gauss,
+        libceed::QuadMode::Gauss,
     )?;
 
     // Determine mesh size from approximate problem size
@@ -103,7 +110,7 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
         if dim > 2 {
             print!(", nz = {}", num_xyz[2]);
         }
-        print!("\n");
+        println!();
     }
 
     // Build ElemRestriction objects describing the mesh and solution discrete
@@ -207,9 +214,9 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_build_closure = ceed
         .q_function_interior(1, Box::new(build_diff))?
-        .input("dx", ncomp_x * dim, EvalMode::Grad)?
-        .input("weights", 1, EvalMode::Weight)?
-        .output("qdata", dim * (dim + 1) / 2, EvalMode::None)?;
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)?;
     // -- QFunction from gallery
     let qf_build_named = {
         let name = format!("Poisson{}DBuild", dim);
@@ -249,7 +256,7 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
         match dim {
             1 => {
                 let q = qdata.len();
-                for c in 0..3 {
+                for c in 0..ncomp_u {
                     vg.iter_mut()
                         .skip(c * q)
                         .zip(ug.iter().skip(c * q).zip(qdata.iter()))
@@ -259,12 +266,12 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
             2 => {
                 let q = qdata.len() / 3;
                 for i in 0..q {
+                    let dxdxdxdx_t = [
+                        [qdata[i + 0 * q], qdata[i + 2 * q]],
+                        [qdata[i + 2 * q], qdata[i + 1 * q]],
+                    ];
                     for c in 0..ncomp_u {
                         let du = [ug[i + (c + 0 * ncomp_u) * q], ug[i + (c + 1 * ncomp_u) * q]];
-                        let dxdxdxdx_t = [
-                            [qdata[i + 0 * q], qdata[i + 2 * q]],
-                            [qdata[i + 2 * q], qdata[i + 1 * q]],
-                        ];
                         for j in 0..dim {
                             vg[i + (c + j * ncomp_u) * q] =
                                 du[0] * dxdxdxdx_t[0][j] + du[1] * dxdxdxdx_t[1][j];
@@ -275,17 +282,17 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
             3 => {
                 let q = qdata.len() / 6;
                 for i in 0..q {
+                    let dxdxdxdx_t = [
+                        [qdata[i + 0 * q], qdata[i + 5 * q], qdata[i + 4 * q]],
+                        [qdata[i + 5 * q], qdata[i + 1 * q], qdata[i + 3 * q]],
+                        [qdata[i + 4 * q], qdata[i + 3 * q], qdata[i + 2 * q]],
+                    ];
                     for c in 0..ncomp_u {
                         let du = [
                             ug[i + (c + 0 * ncomp_u) * q],
                             ug[i + (c + 1 * ncomp_u) * q],
                             ug[i + (c + 2 * ncomp_u) * q],
                         ];
-                        let dxdxdxdx_t = [
-                            [qdata[i + 0 * q], qdata[i + 5 * q], qdata[i + 4 * q]],
-                            [qdata[i + 5 * q], qdata[i + 1 * q], qdata[i + 3 * q]],
-                            [qdata[i + 4 * q], qdata[i + 3 * q], qdata[i + 2 * q]],
-                        ];
                         for j in 0..dim {
                             vg[i + (c + j * ncomp_u) * q] = du[0] * dxdxdxdx_t[0][j]
                                 + du[1] * dxdxdxdx_t[1][j]
@@ -302,9 +309,9 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_diff_closure = ceed
         .q_function_interior(1, Box::new(apply_diff))?
-        .input("du", dim * ncomp_u, EvalMode::Grad)?
-        .input("qdata", dim * (dim + 1) / 2, EvalMode::None)?
-        .output("dv", dim * ncomp_u, EvalMode::Grad)?;
+        .input("du", dim * ncomp_u, libceed::EvalMode::Grad)?
+        .input("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)?
+        .output("dv", dim * ncomp_u, libceed::EvalMode::Grad)?;
     // -- QFunction from gallery
     let qf_diff_named = {
         let name = format!("Vector3Poisson{}DApply", dim);
@@ -350,7 +357,7 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> {
     op_diff.apply(&u, &mut v)?;
 
     // Compute the mesh surface area
-    let area: Scalar = v
+    let area: libceed::Scalar = v
         .view()?
         .iter()
         .map(|v| (*v).abs())
@@ -388,7 +395,7 @@ mod tests {
     use super::*;
 
     #[test]
-    fn example_4_1d() {
+    fn example_2_vector_1d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 1,
@@ -400,11 +407,11 @@ mod tests {
             quiet: true,
             gallery: false,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 
     #[test]
-    fn example_4_2d() {
+    fn example_2_vector_2d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 2,
@@ -416,11 +423,11 @@ mod tests {
             quiet: true,
             gallery: false,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 
     #[test]
-    fn example_4_3d() {
+    fn example_2_vector_3d() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 3,
@@ -432,11 +439,11 @@ mod tests {
             quiet: false,
             gallery: false,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 
     #[test]
-    fn example_4_1d_gallery() {
+    fn example_2_vector_1d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 1,
@@ -448,11 +455,11 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 
     #[test]
-    fn example_4_2d_gallery() {
+    fn example_2_vector_2d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 2,
@@ -464,11 +471,11 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 
     #[test]
-    fn example_4_3d_gallery() {
+    fn example_2_vector_3d_gallery() {
         let options = opt::Opt {
             ceed_spec: "/cpu/self/ref/serial".to_string(),
             dim: 3,
@@ -480,7 +487,7 @@ mod tests {
             quiet: true,
             gallery: true,
         };
-        assert!(example_4(options).is_ok());
+        assert!(example_2_vector(options).is_ok());
     }
 }
 
diff --git a/examples/rust/ex4-vector-surface/src/opt.rs b/examples/rust/ex2-surface-vector/src/opt.rs
similarity index 95%
rename from examples/rust/ex4-vector-surface/src/opt.rs
rename to examples/rust/ex2-surface-vector/src/opt.rs
index 8f58427120..ecbeb8c3cc 100644
--- a/examples/rust/ex4-vector-surface/src/opt.rs
+++ b/examples/rust/ex2-surface-vector/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -15,7 +15,6 @@ use clap::Parser;
     name = "libCEED Rust Example 4 - Vector Surface Area",
     about = "This example illustrates a simple usage of libCEED to compute the surface area of a body using matrix-free application of a 3 component vector diffusion operator."
 )]
-#[cfg(not(tarpaulin_include))]
 pub(crate) struct Opt {
     /// libCEED backend resource to use
     #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]
diff --git a/examples/rust/ex4-vector-surface/src/transform.rs b/examples/rust/ex2-surface-vector/src/transform.rs
similarity index 71%
rename from examples/rust/ex4-vector-surface/src/transform.rs
rename to examples/rust/ex2-surface-vector/src/transform.rs
index 085d9bc94d..43cdbfe0f1 100644
--- a/examples/rust/ex4-vector-surface/src/transform.rs
+++ b/examples/rust/ex2-surface-vector/src/transform.rs
@@ -1,32 +1,31 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-use libceed::prelude::*;
-
 // ----------------------------------------------------------------------------
 // Transform mesh coordinates
 // ----------------------------------------------------------------------------
 pub(crate) fn transform_mesh_coordinates(
     dim: usize,
-    mesh_coords: &mut Vector,
-) -> libceed::Result<Scalar> {
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
     // Transform coordinates
     for coord in mesh_coords.view_mut()?.iter_mut() {
         // map [0,1] to [0,1] varying the mesh density
         *coord = 0.5
-            + 1.0 / (3.0 as Scalar).sqrt()
-                * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin()
+            + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin()
     }
 
     // Exact surface area of transformed region
     let exact_area = match dim {
         1 => 2.0,
         2 => 4.0,
-        _ => 6.0,
+        3 => 6.0,
+        _ => unreachable!(),
     };
     Ok(exact_area)
 }
diff --git a/examples/rust/ex2-surface/src/main.rs b/examples/rust/ex2-surface/src/main.rs
index 02349bc666..ee66c4663d 100644
--- a/examples/rust/ex2-surface/src/main.rs
+++ b/examples/rust/ex2-surface/src/main.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -20,19 +20,22 @@
 // line argument (-ceed).
 
 use clap::Parser;
-use libceed::{prelude::*, Ceed};
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
 mod opt;
 mod transform;
 
 // ----------------------------------------------------------------------------
 // Example 2
 // ----------------------------------------------------------------------------
-#[cfg(not(tarpaulin_include))]
 fn main() -> libceed::Result<()> {
     let options = opt::Opt::parse();
     example_2(options)
 }
 
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
 fn example_2(options: opt::Opt) -> libceed::Result<()> {
     // Process command line arguments
     let opt::Opt {
@@ -46,21 +49,20 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
         quiet,
         gallery,
     } = options;
-    assert!(dim >= 1 && dim <= 3);
+    assert!((0..=3).contains(&dim));
     assert!(mesh_degree >= 1);
     assert!(solution_degree >= 1);
     assert!(num_qpts >= 1);
     let ncomp_x = dim;
-    let problem_size: i64;
-    if problem_size_requested < 0 {
-        problem_size = if test {
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
             16 * 16 * (dim * dim) as i64
         } else {
             256 * 1024
-        };
+        }
     } else {
-        problem_size = problem_size_requested;
-    }
+        problem_size_requested
+    };
 
     // Summary output
     if !quiet {
@@ -81,10 +83,20 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
     let ceed = Ceed::init(&ceed_spec);
 
     // Mesh and solution bases
-    let basis_mesh =
-        ceed.basis_tensor_H1_Lagrange(dim, ncomp_x, mesh_degree + 1, num_qpts, QuadMode::Gauss)?;
-    let basis_solution =
-        ceed.basis_tensor_H1_Lagrange(dim, 1, solution_degree + 1, num_qpts, QuadMode::Gauss)?;
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+    let basis_solution = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        1,
+        solution_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
 
     // Determine mesh size from approximate problem size
     let num_xyz = mesh::cartesian_mesh_size(dim, solution_degree, problem_size);
@@ -96,7 +108,7 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
         if dim > 2 {
             print!(", nz = {}", num_xyz[2]);
         }
-        print!("\n");
+        println!();
     }
 
     // Build ElemRestriction objects describing the mesh and solution discrete
@@ -111,7 +123,6 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
         dim * (dim + 1) / 2,
         num_qpts,
     )?;
-
     let (rstr_solution, _) =
         mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, 1, num_qpts)?;
     let mesh_size = rstr_mesh.lvector_size();
@@ -200,9 +211,9 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_build_closure = ceed
         .q_function_interior(1, Box::new(build_diff))?
-        .input("dx", ncomp_x * dim, EvalMode::Grad)?
-        .input("weights", 1, EvalMode::Weight)?
-        .output("qdata", dim * (dim + 1) / 2, EvalMode::None)?;
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)?;
     // -- QFunction from gallery
     let qf_build_named = {
         let name = format!("Poisson{}DBuild", dim);
@@ -281,9 +292,9 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
     };
     let qf_diff_closure = ceed
         .q_function_interior(1, Box::new(apply_diff))?
-        .input("du", dim, EvalMode::Grad)?
-        .input("qdata", dim * (dim + 1) / 2, EvalMode::None)?
-        .output("dv", dim, EvalMode::Grad)?;
+        .input("du", dim, libceed::EvalMode::Grad)?
+        .input("qdata", dim * (dim + 1) / 2, libceed::EvalMode::None)?
+        .output("dv", dim, libceed::EvalMode::Grad)?;
     // -- QFunction from gallery
     let qf_diff_named = {
         let name = format!("Poisson{}DApply", dim);
@@ -320,7 +331,7 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> {
     op_diff.apply(&u, &mut v)?;
 
     // Compute the mesh surface area
-    let area: Scalar = v.view()?.iter().map(|v| (*v).abs()).sum();
+    let area: libceed::Scalar = v.view()?.iter().map(|v| (*v).abs()).sum();
 
     // Output results
     if !quiet {
diff --git a/examples/rust/ex2-surface/src/opt.rs b/examples/rust/ex2-surface/src/opt.rs
index 13b58f26d7..f2c1afc8f2 100644
--- a/examples/rust/ex2-surface/src/opt.rs
+++ b/examples/rust/ex2-surface/src/opt.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -15,7 +15,6 @@ use clap::Parser;
     name = "libCEED Rust Example 2 - Surface Area",
     about = "This example illustrates a simple usage of libCEED to compute the surface area of a body using matrix-free application of a diffusion operator."
 )]
-#[cfg(not(tarpaulin_include))]
 pub(crate) struct Opt {
     /// libCEED backend resource to use
     #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]
diff --git a/examples/rust/ex2-surface/src/transform.rs b/examples/rust/ex2-surface/src/transform.rs
index 085d9bc94d..43cdbfe0f1 100644
--- a/examples/rust/ex2-surface/src/transform.rs
+++ b/examples/rust/ex2-surface/src/transform.rs
@@ -1,32 +1,31 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-use libceed::prelude::*;
-
 // ----------------------------------------------------------------------------
 // Transform mesh coordinates
 // ----------------------------------------------------------------------------
 pub(crate) fn transform_mesh_coordinates(
     dim: usize,
-    mesh_coords: &mut Vector,
-) -> libceed::Result<Scalar> {
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
     // Transform coordinates
     for coord in mesh_coords.view_mut()?.iter_mut() {
         // map [0,1] to [0,1] varying the mesh density
         *coord = 0.5
-            + 1.0 / (3.0 as Scalar).sqrt()
-                * ((2.0 / 3.0) * std::f64::consts::PI as Scalar * (*coord - 0.5)).sin()
+            + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5)).sin()
     }
 
     // Exact surface area of transformed region
     let exact_area = match dim {
         1 => 2.0,
         2 => 4.0,
-        _ => 6.0,
+        3 => 6.0,
+        _ => unreachable!(),
     };
     Ok(exact_area)
 }
diff --git a/examples/rust/ex3-volume-vector/.gitignore b/examples/rust/ex3-volume-vector/.gitignore
new file mode 100644
index 0000000000..a9d37c560c
--- /dev/null
+++ b/examples/rust/ex3-volume-vector/.gitignore
@@ -0,0 +1,2 @@
+target
+Cargo.lock
diff --git a/examples/rust/ex3-volume-vector/Cargo.toml b/examples/rust/ex3-volume-vector/Cargo.toml
new file mode 100644
index 0000000000..bfbe7241e0
--- /dev/null
+++ b/examples/rust/ex3-volume-vector/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "ex3-volume-vector"
+version = "0.11.0"
+authors = [
+    "Jeremy L Thompson <thompson.jeremy.luke@gmail.com>",
+]
+edition = "2018"
+
+[dependencies]
+clap = { version = "4.0.17", features = ["derive"] }
+libceed = { path = "../../../rust/libceed" }
+mesh = { path = "../mesh" }
+
+[package.metadata.release]
+release = false
diff --git a/examples/rust/ex3-volume-vector/src/main.rs b/examples/rust/ex3-volume-vector/src/main.rs
new file mode 100644
index 0000000000..2eb530470e
--- /dev/null
+++ b/examples/rust/ex3-volume-vector/src/main.rs
@@ -0,0 +1,438 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+//
+//                             libCEED Example 1
+//
+// This example illustrates a simple usage of libCEED to compute the volume of a
+// 3D body using matrix-free application of a mass + diff operator.  Arbitrary
+// mesh and solution orders in 1D, 2D and 3D are supported from the same code.
+// This calculation is executed in triplicate with a 3 component vector system.
+//
+// The example has no dependencies, and is designed to be self-contained. For
+// additional examples that use external discretization libraries (MFEM, PETSc,
+// etc.) see the subdirectories in libceed/examples.
+//
+// All libCEED objects use a Ceed device object constructed based on a command
+// line argument (-ceed).
+
+use clap::Parser;
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
+mod opt;
+mod transform;
+
+// ----------------------------------------------------------------------------
+// Example 1
+// ----------------------------------------------------------------------------
+fn main() -> libceed::Result<()> {
+    let options = opt::Opt::parse();
+    example_3_vector(options)
+}
+
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
+fn example_3_vector(options: opt::Opt) -> libceed::Result<()> {
+    // Process command line arguments
+    let opt::Opt {
+        ceed_spec,
+        dim,
+        mesh_degree,
+        solution_degree,
+        num_qpts,
+        problem_size_requested,
+        test,
+        quiet,
+    } = options;
+    assert!((1..=3).contains(&dim));
+    assert!(mesh_degree >= 1);
+    assert!(solution_degree >= 1);
+    assert!(num_qpts >= 1);
+    let ncomp_x = dim;
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
+            8 * 16
+        } else {
+            256 * 1024
+        }
+    } else {
+        problem_size_requested
+    };
+    let ncomp_u = 3;
+
+    // Summary output
+    if !quiet {
+        println!("Selected options: [command line option] : <current value>");
+        println!("    Ceed specification [-c] : {}", ceed_spec);
+        println!("    Mesh dimension     [-d] : {}", dim);
+        println!("    Mesh degree        [-m] : {}", mesh_degree);
+        println!("    Solution degree    [-p] : {}", solution_degree);
+        println!("    Num. 1D quadr. pts [-q] : {}", num_qpts);
+        println!("    Approx. # unknowns [-s] : {}", problem_size);
+        println!("    QFunction source        : user closure");
+    }
+
+    // Initalize ceed context
+    let ceed = Ceed::init(&ceed_spec);
+
+    // Mesh and solution bases
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+    let basis_solution = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_u,
+        solution_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+
+    // Determine mesh size from approximate problem size
+    let num_xyz = mesh::cartesian_mesh_size(dim, solution_degree, problem_size);
+    if !quiet {
+        print!("\nMesh size                   : nx = {}", num_xyz[0]);
+        if dim > 1 {
+            print!(", ny = {}", num_xyz[1]);
+        }
+        if dim > 2 {
+            print!(", nz = {}", num_xyz[2]);
+        }
+        println!();
+    }
+
+    // Build ElemRestriction objects describing the mesh and solution discrete
+    // representations
+    let (rstr_mesh, _) =
+        mesh::build_cartesian_restriction(&ceed, dim, num_xyz, mesh_degree, ncomp_x, num_qpts)?;
+    let (_, rstr_qdata) = mesh::build_cartesian_restriction(
+        &ceed,
+        dim,
+        num_xyz,
+        solution_degree,
+        1 + dim * (dim + 1) / 2,
+        num_qpts,
+    )?;
+    let (rstr_solution, _) =
+        mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, ncomp_u, num_qpts)?;
+    let mesh_size = rstr_mesh.lvector_size();
+    let solution_size = rstr_solution.lvector_size();
+    if !quiet {
+        println!("Number of mesh nodes        : {}", mesh_size / dim);
+        println!("Number of solution nodes    : {}", solution_size);
+    }
+
+    // Create a Vector with the mesh coordinates
+    let mut mesh_coords = mesh::cartesian_mesh_coords(&ceed, dim, num_xyz, mesh_degree, mesh_size)?;
+
+    // Apply a transformation to the mesh coordinates
+    let exact_volume = transform::transform_mesh_coordinates(dim, mesh_size, &mut mesh_coords)?;
+
+    // QFunction that builds the quadrature data for the mass + diff operator
+    // -- QFunction from user closure
+    let build_mass_diff = move |[jacobian, weights, ..]: QFunctionInputs,
+                                [qdata, ..]: QFunctionOutputs| {
+        // Build quadrature data
+        match dim {
+            1 => {
+                let q = qdata.len() / 2;
+                for i in 0..q {
+                    // Mass
+                    qdata[i + q * 0] = weights[i] * jacobian[i];
+                    // Diff
+                    qdata[i + q * 1] = weights[i] / jacobian[i];
+                }
+            }
+            2 => {
+                let q = qdata.len() / 4;
+                for i in 0..q {
+                    let j11 = jacobian[i + q * 0];
+                    let j21 = jacobian[i + q * 1];
+                    let j12 = jacobian[i + q * 2];
+                    let j22 = jacobian[i + q * 3];
+                    // Mass
+                    qdata[i + q * 0] = weights[i] * (j11 * j22 - j21 * j12);
+                    // Diff
+                    let qw = weights[i] / (j11 * j22 - j21 * j12);
+                    qdata[i + q * 1] = qw * (j12 * j12 + j22 * j22);
+                    qdata[i + q * 2] = qw * (j11 * j11 + j21 * j21);
+                    qdata[i + q * 3] = -qw * (j11 * j12 + j21 * j22);
+                }
+            }
+            3 => {
+                let q = qdata.len() / 7;
+                for i in 0..q {
+                    let mut a = [0.0; 9];
+                    for j in 0..3 {
+                        for k in 0..3 {
+                            a[k * 3 + j] = jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 1) % 3))]
+                                * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 2) % 3))]
+                                - jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 2) % 3))]
+                                    * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 1) % 3))];
+                        }
+                    }
+                    // Mass
+                    qdata[i + q * 0] = weights[i]
+                        * (jacobian[i + q * 0] * a[0 * 3 + 0]
+                            + jacobian[i + q * 1] * a[0 * 3 + 1]
+                            + jacobian[i + q * 2] * a[0 * 3 + 2]);
+                    let qw = weights[i]
+                        / (jacobian[i + q * 0] * a[0 * 3 + 0]
+                            + jacobian[i + q * 1] * a[0 * 3 + 1]
+                            + jacobian[i + q * 2] * a[0 * 3 + 2]);
+                    // Diff
+                    qdata[i + q * 1] = qw
+                        * (a[0 * 3 + 0] * a[0 * 3 + 0]
+                            + a[0 * 3 + 1] * a[0 * 3 + 1]
+                            + a[0 * 3 + 2] * a[0 * 3 + 2]);
+                    qdata[i + q * 2] = qw
+                        * (a[1 * 3 + 0] * a[1 * 3 + 0]
+                            + a[1 * 3 + 1] * a[1 * 3 + 1]
+                            + a[1 * 3 + 2] * a[1 * 3 + 2]);
+                    qdata[i + q * 3] = qw
+                        * (a[2 * 3 + 0] * a[2 * 3 + 0]
+                            + a[2 * 3 + 1] * a[2 * 3 + 1]
+                            + a[2 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 4] = qw
+                        * (a[1 * 3 + 0] * a[2 * 3 + 0]
+                            + a[1 * 3 + 1] * a[2 * 3 + 1]
+                            + a[1 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 5] = qw
+                        * (a[0 * 3 + 0] * a[2 * 3 + 0]
+                            + a[0 * 3 + 1] * a[2 * 3 + 1]
+                            + a[0 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 6] = qw
+                        * (a[0 * 3 + 0] * a[1 * 3 + 0]
+                            + a[0 * 3 + 1] * a[1 * 3 + 1]
+                            + a[0 * 3 + 2] * a[1 * 3 + 2]);
+                }
+            }
+            _ => unreachable!(),
+        };
+
+        // Return clean error code
+        0
+    };
+    let qf_build_closure = ceed
+        .q_function_interior(1, Box::new(build_mass_diff))?
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)?;
+    // -- QFunction for use with Operator
+    let qf_build = QFunctionOpt::SomeQFunction(&qf_build_closure);
+
+    // Operator that build the quadrature data for the mass + diff operator
+    let op_build = ceed
+        .operator(qf_build, QFunctionOpt::None, QFunctionOpt::None)?
+        .name("build qdata")?
+        .field("dx", &rstr_mesh, &basis_mesh, VectorOpt::Active)?
+        .field(
+            "weights",
+            ElemRestrictionOpt::None,
+            &basis_mesh,
+            VectorOpt::None,
+        )?
+        .field("qdata", &rstr_qdata, BasisOpt::None, VectorOpt::Active)?
+        .check()?;
+
+    // Compute the quadrature data for the mass + diff operator
+    let elem_qpts = num_qpts.pow(dim as u32);
+    let num_elem: usize = num_xyz.iter().take(dim).product();
+    let mut qdata = ceed.vector(num_elem * elem_qpts * (1 + dim * (dim + 1) / 2))?;
+    op_build.apply(&mesh_coords, &mut qdata)?;
+
+    // QFunction that applies the mass + diff operator
+    // -- QFunction from user closure
+    let apply_mass_diff = move |[u, ug, qdata, ..]: QFunctionInputs,
+                                [v, vg, ..]: QFunctionOutputs| {
+        // Apply diffusion operator
+        match dim {
+            1 => {
+                let q = qdata.len() / 2;
+                for i in 0..q {
+                    for c in 0..ncomp_u {
+                        // Mass
+                        v[i + c * q] = u[i + c * q] * qdata[i + 0 * q];
+                        // Diff
+                        vg[i + c * q] = ug[i + c * q] * qdata[i + 1 * q];
+                    }
+                }
+            }
+            2 => {
+                let q = qdata.len() / 4;
+                for i in 0..q {
+                    let dxdxdxdx_t = [
+                        [qdata[i + 1 * q], qdata[i + 3 * q]],
+                        [qdata[i + 3 * q], qdata[i + 2 * q]],
+                    ];
+                    for c in 0..ncomp_u {
+                        // Mass
+                        v[i + c * q] = u[i + c * q] * qdata[i + 0 * q];
+                        // Diff
+                        let du = [ug[i + (c + 0 * ncomp_u) * q], ug[i + (c + 1 * ncomp_u) * q]];
+                        for j in 0..2 {
+                            vg[i + (j + j * ncomp_u) * q] =
+                                du[0] * dxdxdxdx_t[0][j] + du[1] * dxdxdxdx_t[1][j];
+                        }
+                    }
+                }
+            }
+            3 => {
+                let q = qdata.len() / 7;
+                for i in 0..q {
+                    let dxdxdxdx_t = [
+                        [qdata[i + 1 * q], qdata[i + 6 * q], qdata[i + 5 * q]],
+                        [qdata[i + 6 * q], qdata[i + 2 * q], qdata[i + 4 * q]],
+                        [qdata[i + 5 * q], qdata[i + 4 * q], qdata[i + 3 * q]],
+                    ];
+                    for c in 0..ncomp_u {
+                        // Mass
+                        v[i + c * q] = u[i + c * q] * qdata[i + 0 * q];
+                        // Diff
+                        let du = [
+                            ug[i + (c + 0 * ncomp_u) * q],
+                            ug[i + (c + 1 * ncomp_u) * q],
+                            ug[i + (c + 2 * ncomp_u) * q],
+                        ];
+                        for j in 0..3 {
+                            vg[i + (c + j * ncomp_u) * q] = du[0] * dxdxdxdx_t[0][j]
+                                + du[1] * dxdxdxdx_t[1][j]
+                                + du[2] * dxdxdxdx_t[2][j];
+                        }
+                    }
+                }
+            }
+            _ => unreachable!(),
+        };
+
+        // Return clean error code
+        0
+    };
+    let qf_mass_diff_closure = ceed
+        .q_function_interior(1, Box::new(apply_mass_diff))?
+        .input("u", ncomp_u, libceed::EvalMode::Interp)?
+        .input("du", dim * ncomp_u, libceed::EvalMode::Grad)?
+        .input("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)?
+        .output("v", ncomp_u, libceed::EvalMode::Interp)?
+        .output("dv", dim * ncomp_u, libceed::EvalMode::Grad)?;
+    // -- QFunction for use with Operator
+    let qf_mass_diff = QFunctionOpt::SomeQFunction(&qf_mass_diff_closure);
+
+    // Mass + diff Operator
+    let op_mass_diff = ceed
+        .operator(qf_mass_diff, QFunctionOpt::None, QFunctionOpt::None)?
+        .name("mass diff")?
+        .field("u", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("du", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("qdata", &rstr_qdata, BasisOpt::None, &qdata)?
+        .field("v", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("dv", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .check()?;
+
+    // Solution vectors
+    let mut u = ceed.vector(solution_size)?;
+    let mut v = ceed.vector(solution_size)?;
+
+    // Initialize u with component index
+    u.set_value(0.0)?;
+    for c in 0..ncomp_u {
+        let q = solution_size / ncomp_u;
+        u.view_mut()?.iter_mut().skip(c * q).take(q).for_each(|u| {
+            *u = (c + 1) as libceed::Scalar;
+        });
+    }
+
+    // Apply the mass + diff operator
+    op_mass_diff.apply(&u, &mut v)?;
+
+    // Compute the mesh volume
+    let volume: libceed::Scalar = v.view()?.iter().sum::<libceed::Scalar>()
+        / ((ncomp_u * (ncomp_u + 1)) / 2) as libceed::Scalar;
+
+    // Output results
+    if !quiet {
+        println!("Exact mesh volume           : {:.12}", exact_volume);
+        println!("Computed mesh volume        : {:.12}", volume);
+        println!(
+            "Volume error                : {:.12e}",
+            volume - exact_volume
+        );
+    }
+    let tolerance = match dim {
+        1 => 200.0 * libceed::EPSILON,
+        _ => 1E-5,
+    };
+    let error = (volume - exact_volume).abs();
+    if error > tolerance {
+        println!("Volume error too large: {:.12e}", error);
+        return Err(libceed::Error {
+            message: format!(
+                "Volume error too large - expected: {:.12e}, actual: {:.12e}",
+                tolerance, error
+            ),
+        });
+    }
+    Ok(())
+}
+
+// ----------------------------------------------------------------------------
+// Tests
+// ----------------------------------------------------------------------------
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn example_3_vector_1d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 1,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: true,
+        };
+        assert!(example_3_vector(options).is_ok());
+    }
+
+    #[test]
+    fn example_3_vector_2d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 2,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: true,
+        };
+        assert!(example_3_vector(options).is_ok());
+    }
+
+    #[test]
+    fn example_3_vector_vector_3d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 3,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: false,
+        };
+        assert!(example_3_vector(options).is_ok());
+    }
+}
+
+// ----------------------------------------------------------------------------
diff --git a/examples/rust/ex3-volume-vector/src/opt.rs b/examples/rust/ex3-volume-vector/src/opt.rs
new file mode 100644
index 0000000000..edf546b032
--- /dev/null
+++ b/examples/rust/ex3-volume-vector/src/opt.rs
@@ -0,0 +1,45 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+use clap::Parser;
+
+// ----------------------------------------------------------------------------
+// Command line arguments
+// ----------------------------------------------------------------------------
+#[derive(Debug, Parser)]
+#[command(
+    name = "libCEED Rust Example 3 - Volume",
+    about = "This example uses the mass matrix and diffusion matrices to compute the length, area, or volume of a region, depending upon runtime parameters."
+)]
+pub(crate) struct Opt {
+    /// libCEED backend resource to use
+    #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]
+    pub(crate) ceed_spec: String,
+    /// Mesh dimension
+    #[arg(short, long = "dimension", default_value = "3")]
+    pub(crate) dim: usize,
+    /// Polynomial degree for the mesh
+    #[arg(short, long, default_value = "4")]
+    pub(crate) mesh_degree: usize,
+    /// Polynomial degree for the solution
+    #[arg(short = 'p', long, default_value = "4")]
+    pub(crate) solution_degree: usize,
+    /// Number of quadrature points in 1D
+    #[arg(short = 'q', long, default_value = "6")]
+    pub(crate) num_qpts: usize,
+    /// Approximate problem size
+    #[arg(name = "DoF", short = 's', long = "problem_size", default_value = "-1")]
+    pub(crate) problem_size_requested: i64,
+    /// Test mode
+    #[arg(short, long)]
+    pub(crate) test: bool,
+    /// Quiet mode
+    #[arg(short = 'x', long)]
+    pub(crate) quiet: bool,
+}
+
+// ----------------------------------------------------------------------------
diff --git a/examples/rust/ex3-volume-vector/src/transform.rs b/examples/rust/ex3-volume-vector/src/transform.rs
new file mode 100644
index 0000000000..e022a34860
--- /dev/null
+++ b/examples/rust/ex3-volume-vector/src/transform.rs
@@ -0,0 +1,50 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// ----------------------------------------------------------------------------
+// Transform mesh coordinates
+// ----------------------------------------------------------------------------
+pub(crate) fn transform_mesh_coordinates(
+    dim: usize,
+    mesh_size: usize,
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
+    // Transform coordinates
+    match dim {
+        1 => {
+            for coord in mesh_coords.view_mut()?.iter_mut() {
+                // map [0,1] to [0,1] varying the mesh density
+                *coord = 0.5
+                    + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                        * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5))
+                            .sin()
+            }
+        }
+        _ => {
+            let num_nodes = mesh_size / dim;
+            let mut coords = mesh_coords.view_mut()?;
+            for i in 0..num_nodes {
+                // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
+                // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
+                let u = coords[i] + 1.;
+                let v = coords[i + num_nodes] * std::f64::consts::PI / 2.;
+                coords[i] = u * v.cos();
+                coords[i + num_nodes] = u * v.sin();
+            }
+        }
+    }
+
+    // Exact volume of transformed region
+    let exact_volume = match dim {
+        1 => 1.,
+        2 | 3 => 3. / 4. * std::f64::consts::PI,
+        _ => unreachable!(),
+    };
+    Ok(exact_volume)
+}
+
+// ----------------------------------------------------------------------------
diff --git a/examples/rust/ex3-volume/.gitignore b/examples/rust/ex3-volume/.gitignore
new file mode 100644
index 0000000000..a9d37c560c
--- /dev/null
+++ b/examples/rust/ex3-volume/.gitignore
@@ -0,0 +1,2 @@
+target
+Cargo.lock
diff --git a/examples/rust/ex3-volume/Cargo.toml b/examples/rust/ex3-volume/Cargo.toml
new file mode 100644
index 0000000000..0c2f979c6b
--- /dev/null
+++ b/examples/rust/ex3-volume/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "ex3-volume"
+version = "0.11.0"
+authors = [
+    "Jeremy L Thompson <thompson.jeremy.luke@gmail.com>",
+]
+edition = "2018"
+
+[dependencies]
+clap = { version = "4.0.17", features = ["derive"] }
+libceed = { path = "../../../rust/libceed" }
+mesh = { path = "../mesh" }
+
+[package.metadata.release]
+release = false
diff --git a/examples/rust/ex3-volume/src/main.rs b/examples/rust/ex3-volume/src/main.rs
new file mode 100644
index 0000000000..16c3dfcfa3
--- /dev/null
+++ b/examples/rust/ex3-volume/src/main.rs
@@ -0,0 +1,415 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+//
+//                             libCEED Example 1
+//
+// This example illustrates a simple usage of libCEED to compute the volume of a
+// 3D body using matrix-free application of a mass + diff operator.  Arbitrary
+// mesh and solution orders in 1D, 2D and 3D are supported from the same code.
+//
+// The example has no dependencies, and is designed to be self-contained. For
+// additional examples that use external discretization libraries (MFEM, PETSc,
+// etc.) see the subdirectories in libceed/examples.
+//
+// All libCEED objects use a Ceed device object constructed based on a command
+// line argument (-ceed).
+
+use clap::Parser;
+use libceed::{
+    BasisOpt, Ceed, ElemRestrictionOpt, QFunctionInputs, QFunctionOpt, QFunctionOutputs, VectorOpt,
+};
+mod opt;
+mod transform;
+
+// ----------------------------------------------------------------------------
+// Example 1
+// ----------------------------------------------------------------------------
+fn main() -> libceed::Result<()> {
+    let options = opt::Opt::parse();
+    example_3(options)
+}
+
+#[allow(clippy::erasing_op)]
+#[allow(clippy::identity_op)]
+fn example_3(options: opt::Opt) -> libceed::Result<()> {
+    // Process command line arguments
+    let opt::Opt {
+        ceed_spec,
+        dim,
+        mesh_degree,
+        solution_degree,
+        num_qpts,
+        problem_size_requested,
+        test,
+        quiet,
+    } = options;
+    assert!((1..=3).contains(&dim));
+    assert!(mesh_degree >= 1);
+    assert!(solution_degree >= 1);
+    assert!(num_qpts >= 1);
+    let ncomp_x = dim;
+    let problem_size: i64 = if problem_size_requested < 0 {
+        if test {
+            8 * 16
+        } else {
+            256 * 1024
+        }
+    } else {
+        problem_size_requested
+    };
+
+    // Summary output
+    if !quiet {
+        println!("Selected options: [command line option] : <current value>");
+        println!("    Ceed specification [-c] : {}", ceed_spec);
+        println!("    Mesh dimension     [-d] : {}", dim);
+        println!("    Mesh degree        [-m] : {}", mesh_degree);
+        println!("    Solution degree    [-p] : {}", solution_degree);
+        println!("    Num. 1D quadr. pts [-q] : {}", num_qpts);
+        println!("    Approx. # unknowns [-s] : {}", problem_size);
+        println!("    QFunction source        : user closure");
+    }
+
+    // Initalize ceed context
+    let ceed = Ceed::init(&ceed_spec);
+
+    // Mesh and solution bases
+    let basis_mesh = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        ncomp_x,
+        mesh_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+    let basis_solution = ceed.basis_tensor_H1_Lagrange(
+        dim,
+        1,
+        solution_degree + 1,
+        num_qpts,
+        libceed::QuadMode::Gauss,
+    )?;
+
+    // Determine mesh size from approximate problem size
+    let num_xyz = mesh::cartesian_mesh_size(dim, solution_degree, problem_size);
+    if !quiet {
+        print!("\nMesh size                   : nx = {}", num_xyz[0]);
+        if dim > 1 {
+            print!(", ny = {}", num_xyz[1]);
+        }
+        if dim > 2 {
+            print!(", nz = {}", num_xyz[2]);
+        }
+        println!();
+    }
+
+    // Build ElemRestriction objects describing the mesh and solution discrete
+    // representations
+    let (rstr_mesh, _) =
+        mesh::build_cartesian_restriction(&ceed, dim, num_xyz, mesh_degree, ncomp_x, num_qpts)?;
+    let (_, rstr_qdata) = mesh::build_cartesian_restriction(
+        &ceed,
+        dim,
+        num_xyz,
+        solution_degree,
+        1 + dim * (dim + 1) / 2,
+        num_qpts,
+    )?;
+    let (rstr_solution, _) =
+        mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, 1, num_qpts)?;
+    let mesh_size = rstr_mesh.lvector_size();
+    let solution_size = rstr_solution.lvector_size();
+    if !quiet {
+        println!("Number of mesh nodes        : {}", mesh_size / dim);
+        println!("Number of solution nodes    : {}", solution_size);
+    }
+
+    // Create a Vector with the mesh coordinates
+    let mut mesh_coords = mesh::cartesian_mesh_coords(&ceed, dim, num_xyz, mesh_degree, mesh_size)?;
+
+    // Apply a transformation to the mesh coordinates
+    let exact_volume = transform::transform_mesh_coordinates(dim, mesh_size, &mut mesh_coords)?;
+
+    // QFunction that builds the quadrature data for the mass + diff operator
+    // -- QFunction from user closure
+    let build_mass_diff = move |[jacobian, weights, ..]: QFunctionInputs,
+                                [qdata, ..]: QFunctionOutputs| {
+        // Build quadrature data
+        match dim {
+            1 => {
+                let q = qdata.len() / 2;
+                for i in 0..q {
+                    // Mass
+                    qdata[i + q * 0] = weights[i] * jacobian[i];
+                    // Diff
+                    qdata[i + q * 1] = weights[i] / jacobian[i];
+                }
+            }
+            2 => {
+                let q = qdata.len() / 4;
+                for i in 0..q {
+                    let j11 = jacobian[i + q * 0];
+                    let j21 = jacobian[i + q * 1];
+                    let j12 = jacobian[i + q * 2];
+                    let j22 = jacobian[i + q * 3];
+                    // Mass
+                    qdata[i + q * 0] = weights[i] * (j11 * j22 - j21 * j12);
+                    // Diff
+                    let qw = weights[i] / (j11 * j22 - j21 * j12);
+                    qdata[i + q * 1] = qw * (j12 * j12 + j22 * j22);
+                    qdata[i + q * 2] = qw * (j11 * j11 + j21 * j21);
+                    qdata[i + q * 3] = -qw * (j11 * j12 + j21 * j22);
+                }
+            }
+            3 => {
+                let q = qdata.len() / 7;
+                for i in 0..q {
+                    let mut a = [0.0; 9];
+                    for j in 0..3 {
+                        for k in 0..3 {
+                            a[k * 3 + j] = jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 1) % 3))]
+                                * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 2) % 3))]
+                                - jacobian[i + q * ((j + 1) % 3 + 3 * ((k + 2) % 3))]
+                                    * jacobian[i + q * ((j + 2) % 3 + 3 * ((k + 1) % 3))];
+                        }
+                    }
+                    // Mass
+                    qdata[i + q * 0] = weights[i]
+                        * (jacobian[i + q * 0] * a[0 * 3 + 0]
+                            + jacobian[i + q * 1] * a[0 * 3 + 1]
+                            + jacobian[i + q * 2] * a[0 * 3 + 2]);
+                    let qw = weights[i]
+                        / (jacobian[i + q * 0] * a[0 * 3 + 0]
+                            + jacobian[i + q * 1] * a[0 * 3 + 1]
+                            + jacobian[i + q * 2] * a[0 * 3 + 2]);
+                    // Diff
+                    qdata[i + q * 1] = qw
+                        * (a[0 * 3 + 0] * a[0 * 3 + 0]
+                            + a[0 * 3 + 1] * a[0 * 3 + 1]
+                            + a[0 * 3 + 2] * a[0 * 3 + 2]);
+                    qdata[i + q * 2] = qw
+                        * (a[1 * 3 + 0] * a[1 * 3 + 0]
+                            + a[1 * 3 + 1] * a[1 * 3 + 1]
+                            + a[1 * 3 + 2] * a[1 * 3 + 2]);
+                    qdata[i + q * 3] = qw
+                        * (a[2 * 3 + 0] * a[2 * 3 + 0]
+                            + a[2 * 3 + 1] * a[2 * 3 + 1]
+                            + a[2 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 4] = qw
+                        * (a[1 * 3 + 0] * a[2 * 3 + 0]
+                            + a[1 * 3 + 1] * a[2 * 3 + 1]
+                            + a[1 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 5] = qw
+                        * (a[0 * 3 + 0] * a[2 * 3 + 0]
+                            + a[0 * 3 + 1] * a[2 * 3 + 1]
+                            + a[0 * 3 + 2] * a[2 * 3 + 2]);
+                    qdata[i + q * 6] = qw
+                        * (a[0 * 3 + 0] * a[1 * 3 + 0]
+                            + a[0 * 3 + 1] * a[1 * 3 + 1]
+                            + a[0 * 3 + 2] * a[1 * 3 + 2]);
+                }
+            }
+            _ => unreachable!(),
+        };
+
+        // Return clean error code
+        0
+    };
+    let qf_build_closure = ceed
+        .q_function_interior(1, Box::new(build_mass_diff))?
+        .input("dx", ncomp_x * dim, libceed::EvalMode::Grad)?
+        .input("weights", 1, libceed::EvalMode::Weight)?
+        .output("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)?;
+    // -- QFunction for use with Operator
+    let qf_build = QFunctionOpt::SomeQFunction(&qf_build_closure);
+
+    // Operator that build the quadrature data for the mass + diff operator
+    let op_build = ceed
+        .operator(qf_build, QFunctionOpt::None, QFunctionOpt::None)?
+        .name("build qdata")?
+        .field("dx", &rstr_mesh, &basis_mesh, VectorOpt::Active)?
+        .field(
+            "weights",
+            ElemRestrictionOpt::None,
+            &basis_mesh,
+            VectorOpt::None,
+        )?
+        .field("qdata", &rstr_qdata, BasisOpt::None, VectorOpt::Active)?
+        .check()?;
+
+    // Compute the quadrature data for the mass + diff operator
+    let elem_qpts = num_qpts.pow(dim as u32);
+    let num_elem: usize = num_xyz.iter().take(dim).product();
+    let mut qdata = ceed.vector(num_elem * elem_qpts * (1 + dim * (dim + 1) / 2))?;
+    op_build.apply(&mesh_coords, &mut qdata)?;
+
+    // QFunction that applies the mass + diff operator
+    // -- QFunction from user closure
+    let apply_mass_diff = move |[u, ug, qdata, ..]: QFunctionInputs,
+                                [v, vg, ..]: QFunctionOutputs| {
+        // Apply diffusion operator
+        match dim {
+            1 => {
+                let q = qdata.len() / 2;
+                for i in 0..q {
+                    // Mass
+                    v[i] = u[i] * qdata[i + 0 * q];
+                    // Diff
+                    vg[i] = ug[i] * qdata[i + 1 * q];
+                }
+            }
+            2 => {
+                let q = qdata.len() / 4;
+                for i in 0..q {
+                    // Mass
+                    v[i] = u[i] * qdata[i + 0 * q];
+                    // Diff
+                    let du = [ug[i + q * 0], ug[i + q * 1]];
+                    let dxdxdxdx_t = [
+                        [qdata[i + 1 * q], qdata[i + 3 * q]],
+                        [qdata[i + 3 * q], qdata[i + 2 * q]],
+                    ];
+                    for j in 0..2 {
+                        vg[i + j * q] = du[0] * dxdxdxdx_t[0][j] + du[1] * dxdxdxdx_t[1][j];
+                    }
+                }
+            }
+            3 => {
+                let q = qdata.len() / 7;
+                for i in 0..q {
+                    // Mass
+                    v[i] = u[i] * qdata[i + 0 * q];
+                    // Diff
+                    let du = [ug[i + q * 0], ug[i + q * 1], ug[i + q * 2]];
+                    let dxdxdxdx_t = [
+                        [qdata[i + 1 * q], qdata[i + 6 * q], qdata[i + 5 * q]],
+                        [qdata[i + 6 * q], qdata[i + 2 * q], qdata[i + 4 * q]],
+                        [qdata[i + 5 * q], qdata[i + 4 * q], qdata[i + 3 * q]],
+                    ];
+                    for j in 0..3 {
+                        vg[i + j * q] = du[0] * dxdxdxdx_t[0][j]
+                            + du[1] * dxdxdxdx_t[1][j]
+                            + du[2] * dxdxdxdx_t[2][j];
+                    }
+                }
+            }
+            _ => unreachable!(),
+        };
+
+        // Return clean error code
+        0
+    };
+    let qf_mass_diff_closure = ceed
+        .q_function_interior(1, Box::new(apply_mass_diff))?
+        .input("u", 1, libceed::EvalMode::Interp)?
+        .input("du", dim, libceed::EvalMode::Grad)?
+        .input("qdata", 1 + dim * (dim + 1) / 2, libceed::EvalMode::None)?
+        .output("v", 1, libceed::EvalMode::Interp)?
+        .output("dv", dim, libceed::EvalMode::Grad)?;
+    // -- QFunction for use with Operator
+    let qf_mass_diff = QFunctionOpt::SomeQFunction(&qf_mass_diff_closure);
+
+    // Mass + diff Operator
+    let op_mass_diff = ceed
+        .operator(qf_mass_diff, QFunctionOpt::None, QFunctionOpt::None)?
+        .name("mass diff")?
+        .field("u", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("du", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("qdata", &rstr_qdata, BasisOpt::None, &qdata)?
+        .field("v", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .field("dv", &rstr_solution, &basis_solution, VectorOpt::Active)?
+        .check()?;
+
+    // Solution vectors
+    let u = ceed.vector_from_slice(&vec![1.0; solution_size])?;
+    let mut v = ceed.vector(solution_size)?;
+
+    // Apply the mass + diff operator
+    op_mass_diff.apply(&u, &mut v)?;
+
+    // Compute the mesh volume
+    let volume: libceed::Scalar = v.view()?.iter().sum();
+
+    // Output results
+    if !quiet {
+        println!("Exact mesh volume           : {:.12}", exact_volume);
+        println!("Computed mesh volume        : {:.12}", volume);
+        println!(
+            "Volume error                : {:.12e}",
+            volume - exact_volume
+        );
+    }
+    let tolerance = match dim {
+        1 => 200.0 * libceed::EPSILON,
+        _ => 1E-5,
+    };
+    let error = (volume - exact_volume).abs();
+    if error > tolerance {
+        println!("Volume error too large: {:.12e}", error);
+        return Err(libceed::Error {
+            message: format!(
+                "Volume error too large - expected: {:.12e}, actual: {:.12e}",
+                tolerance, error
+            ),
+        });
+    }
+    Ok(())
+}
+
+// ----------------------------------------------------------------------------
+// Tests
+// ----------------------------------------------------------------------------
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn example_3_1d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 1,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: true,
+        };
+        assert!(example_3(options).is_ok());
+    }
+
+    #[test]
+    fn example_3_2d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 2,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: true,
+        };
+        assert!(example_3(options).is_ok());
+    }
+
+    #[test]
+    fn example_3_3d() {
+        let options = opt::Opt {
+            ceed_spec: "/cpu/self/ref/serial".to_string(),
+            dim: 3,
+            mesh_degree: 4,
+            solution_degree: 4,
+            num_qpts: 6,
+            problem_size_requested: -1,
+            test: true,
+            quiet: false,
+        };
+        assert!(example_3(options).is_ok());
+    }
+}
+
+// ----------------------------------------------------------------------------
diff --git a/examples/rust/ex3-volume/src/opt.rs b/examples/rust/ex3-volume/src/opt.rs
new file mode 100644
index 0000000000..edf546b032
--- /dev/null
+++ b/examples/rust/ex3-volume/src/opt.rs
@@ -0,0 +1,45 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+use clap::Parser;
+
+// ----------------------------------------------------------------------------
+// Command line arguments
+// ----------------------------------------------------------------------------
+#[derive(Debug, Parser)]
+#[command(
+    name = "libCEED Rust Example 3 - Volume",
+    about = "This example uses the mass matrix and diffusion matrices to compute the length, area, or volume of a region, depending upon runtime parameters."
+)]
+pub(crate) struct Opt {
+    /// libCEED backend resource to use
+    #[arg(name = "CEED", short, long = "ceed", default_value = "/cpu/self")]
+    pub(crate) ceed_spec: String,
+    /// Mesh dimension
+    #[arg(short, long = "dimension", default_value = "3")]
+    pub(crate) dim: usize,
+    /// Polynomial degree for the mesh
+    #[arg(short, long, default_value = "4")]
+    pub(crate) mesh_degree: usize,
+    /// Polynomial degree for the solution
+    #[arg(short = 'p', long, default_value = "4")]
+    pub(crate) solution_degree: usize,
+    /// Number of quadrature points in 1D
+    #[arg(short = 'q', long, default_value = "6")]
+    pub(crate) num_qpts: usize,
+    /// Approximate problem size
+    #[arg(name = "DoF", short = 's', long = "problem_size", default_value = "-1")]
+    pub(crate) problem_size_requested: i64,
+    /// Test mode
+    #[arg(short, long)]
+    pub(crate) test: bool,
+    /// Quiet mode
+    #[arg(short = 'x', long)]
+    pub(crate) quiet: bool,
+}
+
+// ----------------------------------------------------------------------------
diff --git a/examples/rust/ex3-volume/src/transform.rs b/examples/rust/ex3-volume/src/transform.rs
new file mode 100644
index 0000000000..e022a34860
--- /dev/null
+++ b/examples/rust/ex3-volume/src/transform.rs
@@ -0,0 +1,50 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+// ----------------------------------------------------------------------------
+// Transform mesh coordinates
+// ----------------------------------------------------------------------------
+pub(crate) fn transform_mesh_coordinates(
+    dim: usize,
+    mesh_size: usize,
+    mesh_coords: &mut libceed::Vector,
+) -> libceed::Result<libceed::Scalar> {
+    // Transform coordinates
+    match dim {
+        1 => {
+            for coord in mesh_coords.view_mut()?.iter_mut() {
+                // map [0,1] to [0,1] varying the mesh density
+                *coord = 0.5
+                    + 1.0 / (3.0 as libceed::Scalar).sqrt()
+                        * ((2.0 / 3.0) * std::f64::consts::PI as libceed::Scalar * (*coord - 0.5))
+                            .sin()
+            }
+        }
+        _ => {
+            let num_nodes = mesh_size / dim;
+            let mut coords = mesh_coords.view_mut()?;
+            for i in 0..num_nodes {
+                // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
+                // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
+                let u = coords[i] + 1.;
+                let v = coords[i + num_nodes] * std::f64::consts::PI / 2.;
+                coords[i] = u * v.cos();
+                coords[i + num_nodes] = u * v.sin();
+            }
+        }
+    }
+
+    // Exact volume of transformed region
+    let exact_volume = match dim {
+        1 => 1.,
+        2 | 3 => 3. / 4. * std::f64::consts::PI,
+        _ => unreachable!(),
+    };
+    Ok(exact_volume)
+}
+
+// ----------------------------------------------------------------------------
diff --git a/examples/rust/mesh/src/lib.rs b/examples/rust/mesh/src/lib.rs
index 9ad2810381..ce48153b18 100644
--- a/examples/rust/mesh/src/lib.rs
+++ b/examples/rust/mesh/src/lib.rs
@@ -1,11 +1,12 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-use libceed::{prelude::*, Ceed};
+use libceed::{Ceed, ElemRestriction, Vector};
+use std::convert::TryInto;
 
 // ----------------------------------------------------------------------------
 // Determine problem size in each dimension from size and dimenison
@@ -22,16 +23,19 @@ pub fn cartesian_mesh_size(dim: usize, solution_degree: usize, problem_size: i64
 
     // Size per dimension
     let mut r = s % dim;
-    let mut num_xyz = [0; 3];
-    for d in 0..dim {
-        let mut sd = s / dim;
-        if r > 0 {
-            sd += 1;
-            r -= 1;
-        }
-        num_xyz[d] = 1 << sd;
-    }
-    num_xyz
+    let xyz: [usize; 3] = (0..3)
+        .map(|_| -> usize {
+            let mut sd = s / dim;
+            if r > 0 {
+                sd += 1;
+                r -= 1;
+            }
+            1 << sd
+        })
+        .collect::<Vec<usize>>()
+        .try_into()
+        .unwrap();
+    xyz
 }
 
 // ----------------------------------------------------------------------------
@@ -44,7 +48,7 @@ pub fn build_cartesian_restriction(
     degree: usize,
     num_comp: usize,
     num_qpts: usize,
-) -> libceed::Result<(ElemRestriction, ElemRestriction)> {
+) -> libceed::Result<(ElemRestriction<'_>, ElemRestriction<'_>)> {
     let p = degree + 1;
     let num_nodes = p.pow(dim as u32); // number of nodes per element
     let elem_qpts = num_qpts.pow(dim as u32); // number of quadrature pts per element
@@ -91,17 +95,17 @@ pub fn build_cartesian_restriction(
         num_comp,
         scalar_size,
         num_comp * scalar_size,
-        MemType::Host,
+        libceed::MemType::Host,
         &elem_nodes,
     )?;
 
-    // Quadratue data restriction
+    // Quadrature data restriction
     let rstr_qdata = ceed.strided_elem_restriction(
         num_elem,
         elem_qpts,
         num_comp,
         num_comp * elem_qpts * num_elem,
-        CEED_STRIDES_BACKEND,
+        libceed::CEED_STRIDES_BACKEND,
     )?;
     Ok((rstr, rstr_qdata))
 }
@@ -115,7 +119,7 @@ pub fn cartesian_mesh_coords(
     num_xyz: [usize; 3],
     mesh_degree: usize,
     mesh_size: usize,
-) -> libceed::Result<Vector> {
+) -> libceed::Result<Vector<'_>> {
     let p = mesh_degree + 1;
     let mut num_d = [0; 3];
     let mut scalar_size = 1;
@@ -125,13 +129,14 @@ pub fn cartesian_mesh_coords(
     }
 
     // Lobatto points
-    let lobatto_basis = ceed.basis_tensor_H1_Lagrange(1, 1, 2, p, QuadMode::GaussLobatto)?;
+    let lobatto_basis =
+        ceed.basis_tensor_H1_Lagrange(1, 1, 2, p, libceed::QuadMode::GaussLobatto)?;
     let nodes_corners = ceed.vector_from_slice(&[0.0, 1.0])?;
     let mut nodes_full = ceed.vector(p)?;
     lobatto_basis.apply(
         1,
-        TransposeMode::NoTranspose,
-        EvalMode::Interp,
+        libceed::TransposeMode::NoTranspose,
+        libceed::EvalMode::Interp,
         &nodes_corners,
         &mut nodes_full,
     )?;
@@ -146,8 +151,9 @@ pub fn cartesian_mesh_coords(
             let mut r_nodes = gs_nodes;
             for d in 0..dim {
                 let d_1d = r_nodes % num_d[d];
-                coords[gs_nodes + scalar_size * d] =
-                    ((d_1d / (p - 1)) as Scalar + nodes[d_1d % (p - 1)]) / num_xyz[d] as Scalar;
+                coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) as libceed::Scalar
+                    + nodes[d_1d % (p - 1)])
+                    / num_xyz[d] as libceed::Scalar;
                 r_nodes /= num_d[d];
             }
         }
diff --git a/examples/solids/Makefile b/examples/solids/Makefile
index 484d71eda7..490b229acc 100644
--- a/examples/solids/Makefile
+++ b/examples/solids/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
@@ -15,7 +15,7 @@ CEED_DIR ?= ../..
 ceed.pc := $(CEED_DIR)/lib/pkgconfig/ceed.pc
 
 CC = $(call pkgconf, --variable=ccompiler $(PETSc.pc) $(ceed.pc))
-CFLAGS = -std=c99 \
+CFLAGS = -std=c11 \
   $(call pkgconf, --variable=cflags_extra $(PETSc.pc)) \
   $(call pkgconf, --cflags-only-other $(PETSc.pc)) \
   $(OPT)
diff --git a/examples/solids/README.md b/examples/solids/README.md
index 4f0d14ab86..31b4651c3f 100644
--- a/examples/solids/README.md
+++ b/examples/solids/README.md
@@ -1,7 +1,8 @@
 # libCEED: Solid Mechanics Example
 
 This page provides a description of the solid mechanics example for the libCEED library, based on PETSc.
-PETSc v3.17 or a development version of PETSc at commit 0e95d842 or later is required.
+
+Ratel, a more fully featured solid mechanics library, can be found on [GitLab](https://gitlab.com/micromorph/ratel).
 
 This code solves the steady-state static momentum balance equations using unstructured high-order finite/spectral element spatial discretizations.
 In this mini-app, we consider three formulations used in solid mechanics applications: linear elasticity, Neo-Hookean hyperelasticity at small strain, and Neo-Hookean hyperelasticity at finite strain.
@@ -21,7 +22,7 @@ and run with:
 
 ## Runtime options
 
-% inclusion-solids-marker
+<!-- solids-inclusion -->
 
 The elasticity mini-app is controlled via command-line options, the following of which are mandatory.
 
@@ -72,7 +73,7 @@ As an alternative example exploiting {code}`-dm_plex_box_faces`, we consider a {
 Sides 1 through 6 are rotated around $x$-axis:
 
 ```
-./elasticity -problem FSInitial-NH1 -E 1 -nu 0.3 -num_steps 40 -snes_linesearch_type cp -dm_plex_box_faces 4,4,4 -bc_clamp 1,2,3,4,5,6 -bc_clamp_1_rotate 0,0,1,0,.3 -bc_clamp_2_rotate 0,0,1,0,.3 -bc_clamp_3_rotate 0,0,1,0,.3 -bc_clamp_4_rotate 0,0,1,0,.3 -bc_clamp_5_rotate 0,0,1,0,.3 -bc_clamp_6_rotate 0,0,1,0,.3
+./elasticity -problem FS-NH -E 1 -nu 0.3 -num_steps 40 -snes_linesearch_type cp -dm_plex_box_faces 4,4,4 -bc_clamp 1,2,3,4,5,6 -bc_clamp_1_rotate 0,0,1,0,.3 -bc_clamp_2_rotate 0,0,1,0,.3 -bc_clamp_3_rotate 0,0,1,0,.3 -bc_clamp_4_rotate 0,0,1,0,.3 -bc_clamp_5_rotate 0,0,1,0,.3 -bc_clamp_6_rotate 0,0,1,0,.3
 ```
 
 :::{note}
@@ -103,7 +104,7 @@ The command line options just shown are the minimum requirements to run the mini
   -
 
 * - `-problem`
-  - Problem to solve (`Linear`, `SS-NH`, `FSInitial-NH1`, etc.)
+  - Problem to solve (`Linear`, `FS-NH`, `FS-MR`, etc.)
   - `Linear`
 
 * - `-forcing`
diff --git a/examples/solids/elasticity.c b/examples/solids/elasticity.c
index 747f04835f..6c7db8fe97 100644
--- a/examples/solids/elasticity.c
+++ b/examples/solids/elasticity.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -25,9 +25,9 @@
 //
 // Sample meshes can be found at https://github.com/jeremylt/ceedSampleMeshes
 //
-//TESTARGS(name="linear elasticity, MMS")                                 -ceed {ceed_resource} -test -degree 3 -nu 0.3 -E 1 -dm_plex_box_faces 3,3,3
-//TESTARGS(name="Neo-Hookean hyperelasticity, initial configuration 1")   -ceed {ceed_resource} -test -problem FSInitial-NH1 -E 2.8 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.124627916174e-01
-//TESTARGS(name="Mooney-Rivlin hyperelasticity, initial configuration 1") -ceed {ceed_resource} -test -problem FSInitial-MR1 -mu_1 .5 -mu_2 .5 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.339138880207e-01
+//TESTARGS(name="linear elasticity, MMS")        -ceed {ceed_resource} -test -degree 3 -nu 0.3 -E 1 -dm_plex_box_faces 3,3,3
+//TESTARGS(name="Neo-Hookean hyperelasticity")   -ceed {ceed_resource} -test -problem FS-NH -E 2.8 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.124627916174e-01
+//TESTARGS(name="Mooney-Rivlin hyperelasticity") -ceed {ceed_resource} -test -problem FS-MR -mu_1 .5 -mu_2 .5 -nu 0.4 -degree 2 -dm_plex_box_faces 2,2,2 -num_steps 1 -bc_clamp 6 -bc_traction 5 -bc_traction_5 0,0,-.5 -expect_final_strain_energy 2.339138880207e-01
 
 /// @file
 /// CEED elasticity example using PETSc with DMPlex
diff --git a/examples/solids/elasticity.h b/examples/solids/elasticity.h
index 7ac246eccc..9458668b30 100644
--- a/examples/solids/elasticity.h
+++ b/examples/solids/elasticity.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -21,6 +21,6 @@
 #include "include/utils.h"
 #include "problems/problems.h"
 
-#if PETSC_VERSION_LT(3, 21, 0)
-#error "PETSc v3.21 or later is required"
+#if PETSC_VERSION_LT(3, 23, 0)
+#error "PETSc v3.23 or later is required"
 #endif
diff --git a/examples/solids/include/boundary.h b/examples/solids/include/boundary.h
index 7143b7c262..ca5916b682 100644
--- a/examples/solids/include/boundary.h
+++ b/examples/solids/include/boundary.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/cl-options.h b/examples/solids/include/cl-options.h
index 9c56398139..1d4b8fc962 100644
--- a/examples/solids/include/cl-options.h
+++ b/examples/solids/include/cl-options.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/matops.h b/examples/solids/include/matops.h
index ca57b33356..9b1fe843ba 100644
--- a/examples/solids/include/matops.h
+++ b/examples/solids/include/matops.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/misc.h b/examples/solids/include/misc.h
index 5836d14ff6..d6dc668b3f 100644
--- a/examples/solids/include/misc.h
+++ b/examples/solids/include/misc.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/setup-dm.h b/examples/solids/include/setup-dm.h
index 8fcfe7a63b..06c5347c18 100644
--- a/examples/solids/include/setup-dm.h
+++ b/examples/solids/include/setup-dm.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/setup-libceed.h b/examples/solids/include/setup-libceed.h
index be8ad14e9b..870f3bdf16 100644
--- a/examples/solids/include/setup-libceed.h
+++ b/examples/solids/include/setup-libceed.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/structs.h b/examples/solids/include/structs.h
index 8c63ce1199..f553002f93 100644
--- a/examples/solids/include/structs.h
+++ b/examples/solids/include/structs.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/include/utils.h b/examples/solids/include/utils.h
index 31188d47e3..709be45d3d 100644
--- a/examples/solids/include/utils.h
+++ b/examples/solids/include/utils.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/index.md b/examples/solids/index.md
index 910959b244..6d164bd9df 100644
--- a/examples/solids/index.md
+++ b/examples/solids/index.md
@@ -34,7 +34,7 @@ $$ (hyperelastic-cd)
 ## Running the mini-app
 
 ```{include} README.md
-:start-after: inclusion-solids-marker
+:start-after: <!-- solids-inclusion -->
 ```
 
 (problem-linear-elasticity)=
@@ -526,254 +526,3 @@ In the case where complete linearization is preferred, note the symmetry $\maths
 Along with 6 entries for $\bm S$, this totals 27 entries of overhead compared to computing everything from $\bm F$.
 This compares with 13 entries of overhead for direct storage of $\{ \bm S, \bm C^{-1}, \log J \}$, which is sufficient for the Neo-Hookean model to avoid all but matrix products.
 :::
-
-(problem-hyperelasticity-finite-strain-current-configuration)=
-
-## Hyperelasticity in current configuration
-
-In the preceeding discussion, all equations have been formulated in the initial configuration.
-This may feel convenient in that the computational domain is clearly independent of the solution, but there are some advantages to defining the equations in the current configuration.
-
-1. Body forces (like gravity), traction, and contact are more easily defined in the current configuration.
-2. Mesh quality in the initial configuration can be very bad for large deformation.
-3. The required storage and numerical representation can be smaller in the current configuration.
-
-Most of the benefit in case 3 can be attained solely by moving the Jacobian representation to the current configuration {cite}`davydov2020matrix`, though residual evaluation may also be slightly faster in current configuration.
-There are multiple commuting paths from the nonlinear weak form in initial configuration {eq}`hyperelastic-weak-form-initial` to the Jacobian weak form in current configuration {eq}`jacobian-weak-form-current`.
-One may push forward to the current configuration and then linearize or linearize in initial configuration and then push forward, as summarized below.
-
-$$
-\begin{CD}
-  {\overbrace{\nabla_X \bm{v} \tcolon \bm{FS}}^{\text{Initial Residual}}}
-  @>{\text{push forward}}>{}>
-  {\overbrace{\nabla_x \bm{v} \tcolon \bm{\tau}}^{\text{Current Residual}}} \\
-  @V{\text{linearize}}V{\begin{smallmatrix} \diff\bm F = \nabla_X\diff\bm u \\ \diff\bm S(\diff\bm E) \end{smallmatrix}}V
-  @V{\begin{smallmatrix} \diff\nabla_x\bm v = -\nabla_x\bm v \nabla_x \diff\bm u \\ \diff\bm\tau(\diff\bm\epsilon) \end{smallmatrix}}V{\text{linearize}}V \\
-  {\underbrace{\nabla_X\bm{v}\tcolon \Big(\diff\bm{F}\bm{S} + \bm{F}\diff\bm{S}\Big)}_\text{Initial Jacobian}}
-  @>{\text{push forward}}>{}>
-  {\underbrace{\nabla_x\bm{v}\tcolon \Big(\diff\bm{\tau} -\bm{\tau}(\nabla_x \diff\bm{u})^T \Big)}_\text{Current Jacobian}}
-\end{CD}
-$$ (initial-current-linearize)
-
-We will follow both paths for consistency and because both intermediate representations may be useful for implementation.
-
-### Push forward, then linearize
-
-The first term of {eq}`hyperelastic-weak-form-initial` can be rewritten in terms of the symmetric Kirchhoff stress tensor
-$\bm{\tau}=J\bm{\sigma}=\bm{P}\bm{F}^T = \bm F \bm S \bm F^T$ as
-
-$$
-\nabla_X \bm{v} \tcolon \bm{P} = \nabla_X \bm{v} \tcolon \bm{\tau}\bm{F}^{-T} = \nabla_X \bm{v}\bm{F}^{-1} \tcolon \bm{\tau} = \nabla_x \bm{v} \tcolon \bm{\tau}
-$$
-
-therefore, the weak form in terms of $\bm{\tau}$ and $\nabla_x$ with integral over $\Omega_0$ is
-
-$$
-\int_{\Omega_0}{\nabla_x \bm{v} \tcolon \bm{\tau}} \, dV
- - \int_{\Omega_0}{\bm{v} \cdot \rho_0 \bm{g}} \, dV
- - \int_{\partial \Omega_0}{\bm{v}\cdot(\bm{P}\cdot\hat{\bm{N}})} \, dS
- = 0, \quad \forall \bm v \in \mathcal V.
-$$ (hyperelastic-weak-form-current)
-
-#### Linearize in current configuration
-
-To derive a Newton linearization of {eq}`hyperelastic-weak-form-current`, first we define
-
-$$
-\nabla_x \diff \bm{u} = \nabla_X \diff \bm{u} \  \bm{F}^{-1} = \diff \bm{F} \bm{F}^{-1}
-$$ (nabla_xdu)
-
-and $\bm{\tau}$ for Neo-Hookean materials as the push forward of {eq}`neo-hookean-stress`
-
-$$
-\bm{\tau} = \bm{F}\bm{S}\bm{F}^T = \mu (\bm{b} - \bm I_3) + \lambda \log J \bm{I}_3,
-$$ (tau-neo-hookean)
-
-where $\bm{b} = \bm{F} \bm{F}^T$, is the left Cauchy-Green tensor.
-Then by expanding the directional derivative of $\nabla_x \bm{v} \tcolon \bm{\tau}$, we arrive at
-
-$$
-\diff \ (\nabla_x \bm{v} \tcolon \bm{\tau}) = \diff \ (\nabla_x \bm{v})\tcolon \bm{\tau} + \nabla_x \bm{v} \tcolon \diff \bm{\tau} .
-$$ (hyperelastic-linearization-current1)
-
-The first term of {eq}`hyperelastic-linearization-current1` can be written as
-
-$$
-\begin{aligned} \diff \ (\nabla_x \bm{v})\tcolon \bm{\tau} &= \diff \ (\nabla_X \bm{v} \bm{F}^{-1})\tcolon \bm{\tau} = \Big(\underbrace{\nabla_X (\diff \bm{v})}_{0}\bm{F}^{-1} +  \nabla_X \bm{v}\diff \bm{F}^{-1}\Big)\tcolon \bm{\tau}\\   &= \Big(-\nabla_X \bm{v} \bm{F}^{-1}\diff\bm{F}\bm{F}^{-1}\Big)\tcolon \bm{\tau}=\Big(-\nabla_x \bm{v} \diff\bm{F}\bm{F}^{-1}\Big)\tcolon \bm{\tau}\\   &= \Big(-\nabla_x \bm{v} \nabla_x \diff\bm{u} \Big)\tcolon \bm{\tau}= -\nabla_x \bm{v}\tcolon\bm{\tau}(\nabla_x \diff\bm{u})^T \,, \end{aligned}
-$$
-
-where we have used $\diff \bm{F}^{-1}=-\bm{F}^{-1} \diff \bm{F} \bm{F}^{-1}$ and {eq}`nabla_xdu`.
-Using this and {eq}`hyperelastic-linearization-current1` in {eq}`hyperelastic-weak-form-current` yields the weak form in the current configuration
-
-$$
-\int_{\Omega_0} \nabla_x \bm v \tcolon \Big(\diff\bm\tau - \bm\tau (\nabla_x \diff\bm u)^T \Big) = \text{rhs}.
-$$ (jacobian-weak-form-current)
-
-In the following, we will sometimes make use of the incremental strain tensor in the current configuration,
-
-$$
-\diff\bm\epsilon \equiv \frac{1}{2}\Big(\nabla_x \diff\bm{u} + (\nabla_x \diff\bm{u})^T   \Big) .
-$$
-
-:::{dropdown} Deriving $\diff\bm\tau$ for Neo-Hookean material
-To derive a useful expression of $\diff\bm\tau$ for Neo-Hookean materials, we will use the representations
-
-$$
-\begin{aligned}
-\diff \bm{b} &= \diff \bm{F} \bm{F}^T + \bm{F} \diff \bm{F}^T \\
-&= \nabla_x \diff \bm{u} \ \bm{b} + \bm{b} \ (\nabla_x \diff \bm{u})^T \\
-&= (\nabla_x \diff\bm u)(\bm b - \bm I_3) + (\bm b - \bm I_3) (\nabla_x \diff\bm u)^T + 2 \diff\bm\epsilon
-\end{aligned}
-$$
-
-and
-
-$$
-\begin{aligned} \diff\ (\log J) &= \frac{\partial \log J}{\partial \bm{b}}\tcolon \diff \bm{b} = \frac{\partial J}{J\partial \bm{b}}\tcolon \diff \bm{b}=\frac{1}{2}\bm{b}^{-1}\tcolon \diff \bm{b} \\ &= \frac 1 2 \bm b^{-1} \tcolon \Big(\nabla_x \diff\bm u \ \bm b + \bm b (\nabla_x \diff\bm u)^T \Big) \\ &= \trace (\nabla_x \diff\bm u) \\ &= \trace \diff\bm\epsilon . \end{aligned}
-$$
-
-Substituting into {eq}`tau-neo-hookean` gives
-
-$$
-\begin{aligned}
-\diff \bm{\tau} &= \mu \diff \bm{b} + \lambda \trace (\diff\bm\epsilon) \bm I_3 \\
-&= \underbrace{2 \mu \diff\bm\epsilon + \lambda \trace (\diff\bm\epsilon) \bm I_3 - 2\lambda \log J \diff\bm\epsilon}_{\bm F \diff\bm S \bm F^T} \\
-&\quad + (\nabla_x \diff\bm u)\underbrace{\Big( \mu (\bm b - \bm I_3) + \lambda \log J \bm I_3 \Big)}_{\bm\tau} \\
-&\quad + \underbrace{\Big( \mu (\bm b - \bm I_3) + \lambda \log J \bm I_3 \Big)}_{\bm\tau}  (\nabla_x \diff\bm u)^T ,
-\end{aligned}
-$$ (dtau-neo-hookean)
-
-where the final expression has been identified according to
-
-$$
-\diff\bm\tau = \diff\ (\bm F \bm S \bm F^T) = (\nabla_x \diff\bm u) \bm\tau + \bm F \diff\bm S \bm F^T + \bm\tau(\nabla_x \diff\bm u)^T.
-$$
-:::
-
-Collecting terms, we may thus opt to use either of the two forms
-
-$$
-\begin{aligned}
-\diff \bm{\tau} -\bm{\tau}(\nabla_x \diff\bm{u})^T &= (\nabla_x \diff\bm u)\bm\tau + \bm F \diff\bm S \bm F^T \\
-&= (\nabla_x \diff\bm u)\bm\tau + \lambda \trace(\diff\bm\epsilon) \bm I_3 + 2(\mu - \lambda \log J) \diff\bm\epsilon,
-\end{aligned}
-$$ (cur_simp_Jac)
-
-with the last line showing the especially compact representation available for Neo-Hookean materials.
-
-### Linearize, then push forward
-
-We can move the derivatives to the current configuration via
-
-$$
-\nabla_X \bm v \!:\! \diff\bm P = (\nabla_X \bm v) \bm F^{-1} \!:\! \diff \bm P \bm F^T = \nabla_x \bm v \!:\! \diff\bm P \bm F^T
-$$
-
-and expand
-
-$$
-\begin{aligned}
-\diff\bm P \bm F^T &= \diff\bm F \bm S \bm F^T + \bm F \diff\bm S \bm F^T \\
-&= \underbrace{\diff\bm F \bm F^{-1}}_{\nabla_x \diff\bm u} \underbrace{\bm F \bm S \bm F^T}_{\bm\tau} + \bm F \diff\bm S \bm F^T .
-\end{aligned}
-$$
-
-:::{dropdown} Representation of $\bm F \diff\bm S \bm F^T$ for Neo-Hookean materials
-Now we push {eq}`eq-neo-hookean-incremental-stress` forward via
-
-$$
-\begin{aligned}
-\bm F \diff\bm S \bm F^T &= \lambda (\bm C^{-1} \!:\! \diff\bm E) \bm F \bm C^{-1} \bm F^T
-  + 2 (\mu - \lambda \log J) \bm F \bm C^{-1} \diff\bm E \, \bm C^{-1} \bm F^T \\
-    &= \lambda (\bm C^{-1} \!:\! \diff\bm E) \bm I_3 + 2 (\mu - \lambda \log J) \bm F^{-T} \diff\bm E \, \bm F^{-1} \\
-    &= \lambda \operatorname{trace}(\nabla_x \diff\bm u) \bm I_3 + 2 (\mu - \lambda \log J) \diff\bm \epsilon
-\end{aligned}
-$$
-
-where we have used
-
-$$
-\begin{aligned}
-\bm C^{-1} \!:\! \diff\bm E &= \bm F^{-1} \bm F^{-T} \!:\! \bm F^T \diff\bm F \\
-&= \operatorname{trace}(\bm F^{-1} \bm F^{-T} \bm F^T \diff \bm F) \\
-&= \operatorname{trace}(\bm F^{-1} \diff\bm F) \\
-&= \operatorname{trace}(\diff \bm F \bm F^{-1}) \\
-&= \operatorname{trace}(\nabla_x \diff\bm u)
-\end{aligned}
-$$
-
-and
-
-$$
-\begin{aligned}
-\bm F^{-T} \diff\bm E \, \bm F^{-1} &= \frac 1 2 \bm F^{-T} (\bm F^T \diff\bm F + \diff\bm F^T \bm F) \bm F^{-1} \\
-&= \frac 1 2 (\diff \bm F \bm F^{-1} + \bm F^{-T} \diff\bm F^T) \\
-&= \frac 1 2 \Big(\nabla_x \diff\bm u + (\nabla_x\diff\bm u)^T \Big) \equiv \diff\bm\epsilon.
-\end{aligned}
-$$
-:::
-
-Collecting terms, the weak form of the Newton linearization for Neo-Hookean materials in the current configuration is
-
-$$
-\int_{\Omega_0} \nabla_x \bm v \!:\! \Big( (\nabla_x \diff\bm u) \bm\tau + \lambda \operatorname{trace}(\diff\bm\epsilon)\bm I_3 + 2(\mu - \lambda\log J)\diff \bm\epsilon \Big) dV = \text{rhs},
-$$ (jacobian-weak-form-current2)
-
-which equivalent to Algorithm 2 of {cite}`davydov2020matrix` and requires only derivatives with respect to the current configuration. Note that {eq}`cur_simp_Jac` and {eq}`jacobian-weak-form-current2` have recovered the same representation
-using different algebraic manipulations.
-
-:::{tip}
-We define a second order *Green-Euler* strain tensor (cf. Green-Lagrange strain {eq}`eq-green-lagrange-strain`) as
-
-$$
-\bm e = \frac 1 2 \Big(\bm{b} - \bm{I}_3 \Big) = \frac 1 2 \Big( \nabla_X \bm{u} + (\nabla_X \bm{u})^T + \nabla_X \bm{u} \, (\nabla_X \bm{u})^T \Big).
-$$ (green-euler-strain)
-
-Then, the Kirchhoff stress tensor {eq}`tau-neo-hookean` can be written as
-
-$$
-\bm \tau = \lambda \log J \bm I_{3} + 2\mu \bm e,
-$$ (tau-neo-hookean-stable)
-
-which is more numerically stable for small strain, and thus preferred for computation. Note that the $\log J$ is computed via `log1p` {eq}`log1p`, as we discussed in the previous tip.
-:::
-
-### Jacobian representation
-
-We have implemented four storage variants for the Jacobian in our finite strain hyperelasticity. In each case, some variables are computed during residual evaluation and used during Jacobian application.
-
-:::{list-table} Four algorithms for Jacobian action in finite strain hyperelasticity problem
-:header-rows: 1
-:widths: auto
-
-* - Option `-problem`
-  - Static storage
-  - Computed storage
-  - \# scalars
-  - Equations
-
-
-* - `FSInitial-NH1`
-  - $\nabla_{X} \hat X, \operatorname{det}\nabla_{\hat X} X$
-  - $\nabla_X \bm u$
-  - 19
-  - {eq}`eq-diff-P` {eq}`eq-neo-hookean-incremental-stress`
-
-* - `FSInitial-NH2`
-  - $\nabla_{X} \hat X, \operatorname{det}\nabla_{\hat X} X$
-  - $\nabla_X \bm u, \bm C^{-1}, \lambda \log J$
-  - 26
-  - {eq}`eq-diff-P` {eq}`eq-neo-hookean-incremental-stress`
-
-* - `FSCurrent-NH1`
-  - $\nabla_{X} \hat X, \operatorname{det}\nabla_{\hat X} X$
-  - $\nabla_X \bm u$
-  - 19
-  - {eq}`jacobian-weak-form-current` {eq}`nabla_xdu`
-
-* - `FSCurrent-NH2`
-  - $\operatorname{det}\nabla_{\hat X} X$
-  - $\nabla_x \hat X, \bm \tau, \lambda \log J$
-  - 17
-  - {eq}`jacobian-weak-form-current` {eq}`jacobian-weak-form-current2`
-:::
diff --git a/examples/solids/problems/cl-problems.h b/examples/solids/problems/cl-problems.h
index 249e1fd604..f596a01b60 100644
--- a/examples/solids/problems/cl-problems.h
+++ b/examples/solids/problems/cl-problems.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,22 +7,7 @@
 #pragma once
 
 // Problem options
-typedef enum {
-  ELAS_LINEAR        = 0,
-  ELAS_SS_NH         = 1,
-  ELAS_FSInitial_NH1 = 2,
-  ELAS_FSInitial_NH2 = 3,
-  ELAS_FSCurrent_NH1 = 4,
-  ELAS_FSCurrent_NH2 = 5,
-  ELAS_FSInitial_MR1 = 6
-} problemType;
-static const char *const problemTypes[]        = {"Linear",        "SS-NH",         "FSInitial-NH1", "FSInitial-NH2", "FSCurrent-NH1",
-                                                  "FSCurrent-NH2", "FSInitial-MR1", "problemType",   "ELAS_",         0};
-static const char *const problemTypesForDisp[] = {
-    "Linear elasticity",
-    "Hyperelasticity small strain, Neo-Hookean",
-    "Hyperelasticity finite strain Initial configuration Neo-Hookean w/ dXref_dxinit, Grad(u) storage",
-    "Hyperelasticity finite strain Initial configuration Neo-Hookean w/ dXref_dxinit, Grad(u), C_inv, constant storage",
-    "Hyperelasticity finite strain Current configuration Neo-Hookean w/ dXref_dxinit, Grad(u) storage",
-    "Hyperelasticity finite strain Current configuration Neo-Hookean w/ dXref_dxcurr, tau, constant storage",
-    "Hyperelasticity finite strain Initial configuration Moony-Rivlin w/ dXref_dxinit, Grad(u) storage"};
+typedef enum { ELAS_LINEAR = 0, ELAS_FS_NH = 2, ELAS_FS_MR = 2 } problemType;
+static const char *const problemTypes[]        = {"Linear", "FS-NH", "FS-MR", "problemType", "ELAS_", 0};
+static const char *const problemTypesForDisp[] = {"Linear elasticity", "Hyperelasticity finite strain Initial configuration Neo-Hookean",
+                                                  "Hyperelasticity finite strain Initial configuration Moony-Rivlin"};
diff --git a/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c b/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c
deleted file mode 100644
index 57d37efd63..0000000000
--- a/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/finite-strain-mooney-rivlin-initial-1.h"
-
-#include <ceed.h>
-#include <petscsys.h>
-
-#include "../include/setup-libceed.h"
-#include "../include/structs.h"
-#include "../problems/mooney-rivlin.h"
-#include "../problems/problems.h"
-#include "../qfunctions/common.h"
-
-static const char *const field_names[] = {"gradu"};
-static CeedInt           field_sizes[] = {9};
-
-ProblemData finite_strain_Mooney_Rivlin_initial_1 = {
-    .setup_geo            = SetupGeo,
-    .setup_geo_loc        = SetupGeo_loc,
-    .q_data_size          = 10,
-    .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasFSInitialMR1F,
-    .residual_loc         = ElasFSInitialMR1F_loc,
-    .number_fields_stored = 1,
-    .field_names          = field_names,
-    .field_sizes          = field_sizes,
-    .jacobian             = ElasFSInitialMR1dF,
-    .jacobian_loc         = ElasFSInitialMR1dF_loc,
-    .energy               = ElasFSInitialMR1Energy,
-    .energy_loc           = ElasFSInitialMR1Energy_loc,
-    .diagnostic           = ElasFSInitialMR1Diagnostic,
-    .diagnostic_loc       = ElasFSInitialMR1Diagnostic_loc,
-};
-
-PetscErrorCode SetupLibceedFineLevel_ElasFSInitialMR1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
-                                                      PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size,
-                                                      CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_Mooney_Rivlin_initial_1, fine_level,
-                                  num_comp_u, U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-PetscErrorCode SetupLibceedLevel_ElasFSInitialMR1(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
-                                                  PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_Mooney_Rivlin_initial_1, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
diff --git a/examples/solids/problems/finite-strain-mooney-rivlin.c b/examples/solids/problems/finite-strain-mooney-rivlin.c
new file mode 100644
index 0000000000..9798eeb26e
--- /dev/null
+++ b/examples/solids/problems/finite-strain-mooney-rivlin.c
@@ -0,0 +1,58 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include "../qfunctions/finite-strain-mooney-rivlin.h"
+
+#include <ceed.h>
+#include <petscsys.h>
+
+#include "../include/setup-libceed.h"
+#include "../include/structs.h"
+#include "../problems/mooney-rivlin.h"
+#include "../problems/problems.h"
+#include "../qfunctions/common.h"
+
+static const char *const field_names[] = {"gradu"};
+static CeedInt           field_sizes[] = {9};
+
+ProblemData finite_strain_Mooney_Rivlin = {
+    .setup_geo            = SetupGeo,
+    .setup_geo_loc        = SetupGeo_loc,
+    .q_data_size          = 10,
+    .quadrature_mode      = CEED_GAUSS,
+    .residual             = ElasFSResidual_MR,
+    .residual_loc         = ElasFSResidual_MR_loc,
+    .number_fields_stored = 1,
+    .field_names          = field_names,
+    .field_sizes          = field_sizes,
+    .jacobian             = ElasFSJacobian_MR,
+    .jacobian_loc         = ElasFSJacobian_MR_loc,
+    .energy               = ElasFSEnergy_MR,
+    .energy_loc           = ElasFSEnergy_MR_loc,
+    .diagnostic           = ElasFSDiagnostic_MR,
+    .diagnostic_loc       = ElasFSDiagnostic_MR_loc,
+};
+
+PetscErrorCode SetupLibceedFineLevel_ElasFSMR(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
+                                              PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, CeedVector force_ceed,
+                                              CeedVector neumann_ceed, CeedData *data) {
+  PetscFunctionBegin;
+
+  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_Mooney_Rivlin, fine_level, num_comp_u,
+                                  U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
+
+  PetscFunctionReturn(PETSC_SUCCESS);
+};
+
+PetscErrorCode SetupLibceedLevel_ElasFSMR(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
+                                          PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
+  PetscFunctionBegin;
+
+  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_Mooney_Rivlin, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
+
+  PetscFunctionReturn(PETSC_SUCCESS);
+};
diff --git a/examples/solids/problems/finite-strain-neo-hookean-current-1.c b/examples/solids/problems/finite-strain-neo-hookean-current-1.c
deleted file mode 100644
index e6ad6a8a99..0000000000
--- a/examples/solids/problems/finite-strain-neo-hookean-current-1.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/finite-strain-neo-hookean-current-1.h"
-
-#include <ceed.h>
-#include <petscsys.h>
-
-#include "../include/setup-libceed.h"
-#include "../include/structs.h"
-#include "../problems/neo-hookean.h"
-#include "../problems/problems.h"
-#include "../qfunctions/common.h"
-
-static const char *const field_names[] = {"gradu"};
-static CeedInt           field_sizes[] = {9};
-
-ProblemData finite_strain_neo_Hookean_current_1 = {
-    .setup_geo            = SetupGeo,
-    .setup_geo_loc        = SetupGeo_loc,
-    .q_data_size          = 10,
-    .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasFSCurrentNH1F,
-    .residual_loc         = ElasFSCurrentNH1F_loc,
-    .number_fields_stored = 1,
-    .field_names          = field_names,
-    .field_sizes          = field_sizes,
-    .jacobian             = ElasFSCurrentNH1dF,
-    .jacobian_loc         = ElasFSCurrentNH1dF_loc,
-    .energy               = ElasFSCurrentNH1Energy,
-    .energy_loc           = ElasFSCurrentNH1Energy_loc,
-    .diagnostic           = ElasFSCurrentNH1Diagnostic,
-    .diagnostic_loc       = ElasFSCurrentNH1Diagnostic_loc,
-};
-
-PetscErrorCode SetupLibceedFineLevel_ElasFSCurrentNH1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
-                                                      PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size,
-                                                      CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_current_1, fine_level, num_comp_u,
-                                  U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-PetscErrorCode SetupLibceedLevel_ElasFSCurrentNH1(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
-                                                  PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_current_1, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
diff --git a/examples/solids/problems/finite-strain-neo-hookean-current-2.c b/examples/solids/problems/finite-strain-neo-hookean-current-2.c
deleted file mode 100644
index 78f34d3ee7..0000000000
--- a/examples/solids/problems/finite-strain-neo-hookean-current-2.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/finite-strain-neo-hookean-current-2.h"
-
-#include <ceed.h>
-#include <petscsys.h>
-
-#include "../include/setup-libceed.h"
-#include "../include/structs.h"
-#include "../problems/neo-hookean.h"
-#include "../problems/problems.h"
-#include "../qfunctions/common.h"
-
-static const char *const field_names[] = {"dXdx", "tau", "lambda_log_J"};
-static CeedInt           field_sizes[] = {9, 6, 1};
-
-ProblemData finite_strain_neo_Hookean_current_2 = {
-    .setup_geo            = SetupGeo,
-    .setup_geo_loc        = SetupGeo_loc,
-    .q_data_size          = 10,
-    .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasFSCurrentNH2F,
-    .residual_loc         = ElasFSCurrentNH2F_loc,
-    .number_fields_stored = 3,
-    .field_names          = field_names,
-    .field_sizes          = field_sizes,
-    .jacobian             = ElasFSCurrentNH2dF,
-    .jacobian_loc         = ElasFSCurrentNH2dF_loc,
-    .energy               = ElasFSCurrentNH2Energy,
-    .energy_loc           = ElasFSCurrentNH2Energy_loc,
-    .diagnostic           = ElasFSCurrentNH2Diagnostic,
-    .diagnostic_loc       = ElasFSCurrentNH2Diagnostic_loc,
-};
-
-PetscErrorCode SetupLibceedFineLevel_ElasFSCurrentNH2(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
-                                                      PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size,
-                                                      CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_current_2, fine_level, num_comp_u,
-                                  U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-PetscErrorCode SetupLibceedLevel_ElasFSCurrentNH2(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
-                                                  PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_current_2, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
diff --git a/examples/solids/problems/finite-strain-neo-hookean-initial-1.c b/examples/solids/problems/finite-strain-neo-hookean-initial-1.c
deleted file mode 100644
index cb45b602ad..0000000000
--- a/examples/solids/problems/finite-strain-neo-hookean-initial-1.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/finite-strain-neo-hookean-initial-1.h"
-
-#include <ceed.h>
-#include <petscsys.h>
-
-#include "../include/setup-libceed.h"
-#include "../include/structs.h"
-#include "../problems/neo-hookean.h"
-#include "../problems/problems.h"
-#include "../qfunctions/common.h"
-
-static const char *const field_names[] = {"gradu"};
-static CeedInt           field_sizes[] = {9};
-
-ProblemData finite_strain_neo_Hookean_initial_1 = {
-    .setup_geo            = SetupGeo,
-    .setup_geo_loc        = SetupGeo_loc,
-    .q_data_size          = 10,
-    .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasFSInitialNH1F,
-    .residual_loc         = ElasFSInitialNH1F_loc,
-    .number_fields_stored = 1,
-    .field_names          = field_names,
-    .field_sizes          = field_sizes,
-    .jacobian             = ElasFSInitialNH1dF,
-    .jacobian_loc         = ElasFSInitialNH1dF_loc,
-    .energy               = ElasFSInitialNH1Energy,
-    .energy_loc           = ElasFSInitialNH1Energy_loc,
-    .diagnostic           = ElasFSInitialNH1Diagnostic,
-    .diagnostic_loc       = ElasFSInitialNH1Diagnostic_loc,
-};
-
-PetscErrorCode SetupLibceedFineLevel_ElasFSInitialNH1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
-                                                      PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size,
-                                                      CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_initial_1, fine_level, num_comp_u,
-                                  U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-PetscErrorCode SetupLibceedLevel_ElasFSInitialNH1(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
-                                                  PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_initial_1, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
diff --git a/examples/solids/problems/finite-strain-neo-hookean-initial-2.c b/examples/solids/problems/finite-strain-neo-hookean-initial-2.c
deleted file mode 100644
index 9d52b35aec..0000000000
--- a/examples/solids/problems/finite-strain-neo-hookean-initial-2.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include "../qfunctions/finite-strain-neo-hookean-initial-2.h"
-
-#include <ceed.h>
-#include <petscsys.h>
-
-#include "../include/setup-libceed.h"
-#include "../include/structs.h"
-#include "../problems/neo-hookean.h"
-#include "../problems/problems.h"
-#include "../qfunctions/common.h"
-
-static const char *const field_names[] = {"gradu", "C_inv", "lambda_log_J"};
-static CeedInt           field_sizes[] = {9, 6, 1};
-
-ProblemData finite_strain_neo_Hookean_initial_2 = {
-    .setup_geo            = SetupGeo,
-    .setup_geo_loc        = SetupGeo_loc,
-    .q_data_size          = 10,
-    .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasFSInitialNH2F,
-    .residual_loc         = ElasFSInitialNH2F_loc,
-    .number_fields_stored = 3,
-    .field_names          = field_names,
-    .field_sizes          = field_sizes,
-    .jacobian             = ElasFSInitialNH2dF,
-    .jacobian_loc         = ElasFSInitialNH2dF_loc,
-    .energy               = ElasFSInitialNH2Energy,
-    .energy_loc           = ElasFSInitialNH2Energy_loc,
-    .diagnostic           = ElasFSInitialNH2Diagnostic,
-    .diagnostic_loc       = ElasFSInitialNH2Diagnostic_loc,
-};
-
-PetscErrorCode SetupLibceedFineLevel_ElasFSInitialNH2(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
-                                                      PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size,
-                                                      CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_initial_2, fine_level, num_comp_u,
-                                  U_g_size, U_loc_size, force_ceed, neumann_ceed, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
-
-PetscErrorCode SetupLibceedLevel_ElasFSInitialNH2(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
-                                                  PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
-  PetscFunctionBegin;
-
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_initial_2, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
-
-  PetscFunctionReturn(PETSC_SUCCESS);
-};
diff --git a/examples/solids/problems/small-strain-neo-hookean.c b/examples/solids/problems/finite-strain-neo-hookean.c
similarity index 63%
rename from examples/solids/problems/small-strain-neo-hookean.c
rename to examples/solids/problems/finite-strain-neo-hookean.c
index be2fb27c43..3948d257e3 100644
--- a/examples/solids/problems/small-strain-neo-hookean.c
+++ b/examples/solids/problems/finite-strain-neo-hookean.c
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include "../qfunctions/small-strain-neo-hookean.h"
+#include "../qfunctions/finite-strain-neo-hookean.h"
 
 #include <ceed.h>
 #include <petscsys.h>
@@ -19,40 +19,40 @@
 static const char *const field_names[] = {"gradu"};
 static CeedInt           field_sizes[] = {9};
 
-ProblemData small_strain_neo_Hookean = {
+ProblemData finite_strain_neo_Hookean = {
     .setup_geo            = SetupGeo,
     .setup_geo_loc        = SetupGeo_loc,
     .q_data_size          = 10,
     .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasSSNHF,
-    .residual_loc         = ElasSSNHF_loc,
+    .residual             = ElasFSResidual_NH,
+    .residual_loc         = ElasFSResidual_NH_loc,
     .number_fields_stored = 1,
     .field_names          = field_names,
     .field_sizes          = field_sizes,
-    .jacobian             = ElasSSNHdF,
-    .jacobian_loc         = ElasSSNHdF_loc,
-    .energy               = ElasSSNHEnergy,
-    .energy_loc           = ElasSSNHEnergy_loc,
-    .diagnostic           = ElasSSNHDiagnostic,
-    .diagnostic_loc       = ElasSSNHDiagnostic_loc,
+    .jacobian             = ElasFSJacobian_NH,
+    .jacobian_loc         = ElasFSJacobian_NH_loc,
+    .energy               = ElasFSEnergy_NH,
+    .energy_loc           = ElasFSEnergy_NH_loc,
+    .diagnostic           = ElasFSDiagnostic_NH,
+    .diagnostic_loc       = ElasFSDiagnostic_NH_loc,
 };
 
-PetscErrorCode SetupLibceedFineLevel_ElasSSNH(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
+PetscErrorCode SetupLibceedFineLevel_ElasFSNH(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx,
                                               PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, CeedVector force_ceed,
                                               CeedVector neumann_ceed, CeedData *data) {
   PetscFunctionBegin;
 
-  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, small_strain_neo_Hookean, fine_level, num_comp_u, U_g_size,
+  PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean, fine_level, num_comp_u, U_g_size,
                                   U_loc_size, force_ceed, neumann_ceed, data));
 
   PetscFunctionReturn(PETSC_SUCCESS);
 };
 
-PetscErrorCode SetupLibceedLevel_ElasSSNH(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
+PetscErrorCode SetupLibceedLevel_ElasFSNH(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size,
                                           PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) {
   PetscFunctionBegin;
 
-  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, small_strain_neo_Hookean, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
+  PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data));
 
   PetscFunctionReturn(PETSC_SUCCESS);
 };
diff --git a/examples/solids/problems/linear.c b/examples/solids/problems/linear.c
index c013ee716a..82cb9635d1 100644
--- a/examples/solids/problems/linear.c
+++ b/examples/solids/problems/linear.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -22,15 +22,15 @@ ProblemData linear_elasticity = {
     .setup_geo_loc        = SetupGeo_loc,
     .q_data_size          = 10,
     .quadrature_mode      = CEED_GAUSS,
-    .residual             = ElasLinearF,
-    .residual_loc         = ElasLinearF_loc,
+    .residual             = ElasResidual_Linear,
+    .residual_loc         = ElasResidual_Linear_loc,
     .number_fields_stored = 0,
-    .jacobian             = ElasLineardF,
-    .jacobian_loc         = ElasLineardF_loc,
-    .energy               = ElasLinearEnergy,
-    .energy_loc           = ElasLinearEnergy_loc,
-    .diagnostic           = ElasLinearDiagnostic,
-    .diagnostic_loc       = ElasLinearDiagnostic_loc,
+    .jacobian             = ElasJacobian_Linear,
+    .jacobian_loc         = ElasJacobian_Linear_loc,
+    .energy               = ElasEnergy_Linear,
+    .energy_loc           = ElasEnergy_Linear_loc,
+    .diagnostic           = ElasDiagnostic_Linear,
+    .diagnostic_loc       = ElasDiagnostic_Linear_loc,
     .true_soln            = MMSTrueSoln,
     .true_soln_loc        = MMSTrueSoln_loc,
 };
diff --git a/examples/solids/problems/mooney-rivlin.c b/examples/solids/problems/mooney-rivlin.c
index 2449e98742..4444250187 100644
--- a/examples/solids/problems/mooney-rivlin.c
+++ b/examples/solids/problems/mooney-rivlin.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/mooney-rivlin.h b/examples/solids/problems/mooney-rivlin.h
index 2063e06e19..0903df99d6 100644
--- a/examples/solids/problems/mooney-rivlin.h
+++ b/examples/solids/problems/mooney-rivlin.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/neo-hookean.c b/examples/solids/problems/neo-hookean.c
index 560717673e..dfd2d68005 100644
--- a/examples/solids/problems/neo-hookean.c
+++ b/examples/solids/problems/neo-hookean.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/neo-hookean.h b/examples/solids/problems/neo-hookean.h
index 72f6a2ed65..a80e508001 100644
--- a/examples/solids/problems/neo-hookean.h
+++ b/examples/solids/problems/neo-hookean.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/problems/problems.c b/examples/solids/problems/problems.c
index e125997093..5a819aecf1 100644
--- a/examples/solids/problems/problems.c
+++ b/examples/solids/problems/problems.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -14,12 +14,8 @@ PetscErrorCode RegisterProblems(ProblemFunctions problem_functions) {
   PetscFunctionBegin;
 
   SOLIDS_PROBLEM_REGISTER(problem_functions, "Linear", ElasLinear, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "SS-NH", ElasSSNH, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "FSCurrent-NH1", ElasFSCurrentNH1, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "FSCurrent-NH2", ElasFSCurrentNH2, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-NH1", ElasFSInitialNH1, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-NH2", ElasFSInitialNH2, NH);
-  SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-MR1", ElasFSInitialMR1, MR);
+  SOLIDS_PROBLEM_REGISTER(problem_functions, "FS-NH", ElasFSNH, NH);
+  SOLIDS_PROBLEM_REGISTER(problem_functions, "FS-MR", ElasFSMR, MR);
 
   PetscFunctionReturn(PETSC_SUCCESS);
 };
diff --git a/examples/solids/problems/problems.h b/examples/solids/problems/problems.h
index 17503fda72..e71ab1719b 100644
--- a/examples/solids/problems/problems.h
+++ b/examples/solids/problems/problems.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -35,9 +35,5 @@ PetscErrorCode RegisterProblems(ProblemFunctions problem_functions);
                                           PetscInt u_loc_size, CeedVector fine_mult, CeedData *data);
 
 SOLIDS_PROBLEM(ElasLinear);
-SOLIDS_PROBLEM(ElasSSNH);
-SOLIDS_PROBLEM(ElasFSCurrentNH1);
-SOLIDS_PROBLEM(ElasFSCurrentNH2);
-SOLIDS_PROBLEM(ElasFSInitialNH1);
-SOLIDS_PROBLEM(ElasFSInitialNH2);
-SOLIDS_PROBLEM(ElasFSInitialMR1);
+SOLIDS_PROBLEM(ElasFSNH);
+SOLIDS_PROBLEM(ElasFSMR);
diff --git a/examples/solids/qfunctions/common.h b/examples/solids/qfunctions/common.h
index bfdb92522f..cf63c02a93 100644
--- a/examples/solids/qfunctions/common.h
+++ b/examples/solids/qfunctions/common.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,7 +8,7 @@
 /// @file
 /// Geometric factors for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 // -----------------------------------------------------------------------------
 // This QFunction sets up the geometric factors required for integration and coordinate transformations
diff --git a/examples/solids/qfunctions/constant-force.h b/examples/solids/qfunctions/constant-force.h
index a94dc4f3bf..e37505c7e4 100644
--- a/examples/solids/qfunctions/constant-force.h
+++ b/examples/solids/qfunctions/constant-force.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// Constant forcing term for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef PHYSICS_STRUCT
 #define PHYSICS_STRUCT
diff --git a/examples/solids/qfunctions/finite-strain-mooney-rivlin-initial-1.h b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
similarity index 95%
rename from examples/solids/qfunctions/finite-strain-mooney-rivlin-initial-1.h
rename to examples/solids/qfunctions/finite-strain-mooney-rivlin.h
index 444b71d27f..9fc34b9ff5 100644
--- a/examples/solids/qfunctions/finite-strain-mooney-rivlin-initial-1.h
+++ b/examples/solids/qfunctions/finite-strain-mooney-rivlin.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// Hyperelasticity, finite strain for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // Mooney-Rivlin context
@@ -91,14 +93,14 @@ CEED_QFUNCTION_HELPER int computeMatinvSym(const CeedScalar A[3][3], const CeedS
   };
   for (CeedInt m = 0; m < 6; m++) Ainv[m] = B[m] / (detA);
 
-  return 0;
-};
+  return CEED_ERROR_SUCCESS;
+}
 #endif
 // -----------------------------------------------------------------------------
 // Common computations between FS and dFS
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION_HELPER int commonFSMR1(const CeedScalar mu_1, const CeedScalar mu_2, const CeedScalar lambda, const CeedScalar grad_u[3][3],
-                                      CeedScalar Swork[6], CeedScalar Cwork[6], CeedScalar Cinvwork[6], CeedScalar *logJ) {
+CEED_QFUNCTION_HELPER int commonFSMR(const CeedScalar mu_1, const CeedScalar mu_2, const CeedScalar lambda, const CeedScalar grad_u[3][3],
+                                     CeedScalar Swork[6], CeedScalar Cwork[6], CeedScalar Cinvwork[6], CeedScalar *logJ) {
   // E - Green-Lagrange strain tensor
   //     E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u)
   const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
@@ -147,13 +149,13 @@ CEED_QFUNCTION_HELPER int commonFSMR1(const CeedScalar mu_1, const CeedScalar mu
                - mu_2 * Cwork[i];
   }
 
-  return 0;
-};
+  return CEED_ERROR_SUCCESS;
+}
 
 // -----------------------------------------------------------------------------
 // Residual evaluation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialMR1F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSResidual_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -218,7 +220,7 @@ CEED_QFUNCTION(ElasFSInitialMR1F)(void *ctx, CeedInt Q, const CeedScalar *const
 
     // Common components of finite strain calculations
     CeedScalar Swork[6], Cwork[6], Cinvwork[6], logJ;
-    commonFSMR1(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ);
+    commonFSMR(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ);
 
     // Second Piola-Kirchhoff (S)
     const CeedScalar S[3][3] = {
@@ -245,13 +247,13 @@ CEED_QFUNCTION(ElasFSInitialMR1F)(void *ctx, CeedInt Q, const CeedScalar *const
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Jacobian evaluation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialMR1dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSJacobian_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
         (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
@@ -311,7 +313,7 @@ CEED_QFUNCTION(ElasFSInitialMR1dF)(void *ctx, CeedInt Q, const CeedScalar *const
 
     // Common components of finite strain calculations
     CeedScalar Swork[6], Cwork[6], Cinvwork[6], logJ;
-    commonFSMR1(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ);
+    commonFSMR(mu_1, mu_2, lambda, tempgradu, Swork, Cwork, Cinvwork, &logJ);
 
     // dE - Green-Lagrange strain tensor
     const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
@@ -408,12 +410,13 @@ CEED_QFUNCTION(ElasFSInitialMR1dF)(void *ctx, CeedInt Q, const CeedScalar *const
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
+
 // -----------------------------------------------------------------------------
 // Strain energy computation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialMR1Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSEnergy_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -498,13 +501,13 @@ CEED_QFUNCTION(ElasFSInitialMR1Energy)(void *ctx, CeedInt Q, const CeedScalar *c
 
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Nodal diagnostic quantities for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialMR1Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSDiagnostic_MR)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
         (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
@@ -605,6 +608,6 @@ CEED_QFUNCTION(ElasFSInitialMR1Diagnostic)(void *ctx, CeedInt Q, const CeedScala
     diagnostic[7][i] = (0.5 * lambda * logJ * logJ - (mu_1 + 2 * mu_2) * logJ + (mu_1 / 2.) * (I_1 - 3) + (mu_2 / 2.) * (I_2 - 3));
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 // -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-current-1.h b/examples/solids/qfunctions/finite-strain-neo-hookean-current-1.h
deleted file mode 100644
index 129a0af3b4..0000000000
--- a/examples/solids/qfunctions/finite-strain-neo-hookean-current-1.h
+++ /dev/null
@@ -1,535 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Hyperelasticity, finite strain for solid mechanics example using PETSc
-
-#include <ceed.h>
-#include <math.h>
-
-#ifndef PHYSICS_STRUCT
-#define PHYSICS_STRUCT
-typedef struct Physics_private *Physics;
-struct Physics_private {
-  CeedScalar nu;  // Poisson's ratio
-  CeedScalar E;   // Young's Modulus
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Series approximation of log1p()
-//  log1p() is not vectorized in libc
-//
-//  The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1.
-//  The initialization extends this range to 0.35 ~= sqrt(2)/4 < J < sqrt(2)*2 ~= 2.83, which should be sufficient for applications of the Neo-Hookean
-//  model.
-// -----------------------------------------------------------------------------
-#ifndef LOG1P_SERIES_SHIFTED
-#define LOG1P_SERIES_SHIFTED
-CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) {
-  const CeedScalar left = sqrt(2.) / 2 - 1, right = sqrt(2.) - 1;
-  CeedScalar       sum = 0;
-  if (1) {           // Disable if the smaller range sqrt(2)/2 < J < sqrt(2) is sufficient
-    if (x < left) {  // Replace if with while for arbitrary range (may hurt vectorization)
-      sum -= log(2.) / 2;
-      x = 1 + 2 * x;
-    } else if (right < x) {
-      sum += log(2.) / 2;
-      x = (x - 1) / 2;
-    }
-  }
-  CeedScalar       y  = x / (2. + x);
-  const CeedScalar y2 = y * y;
-  sum += y;
-  y *= y2;
-  sum += y / 3;
-  y *= y2;
-  sum += y / 5;
-  y *= y2;
-  sum += y / 7;
-  return 2 * sum;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute det F - 1
-// -----------------------------------------------------------------------------
-#ifndef DETJM1
-#define DETJM1
-CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) {
-  return grad_u[0][0] * (grad_u[1][1] * grad_u[2][2] - grad_u[1][2] * grad_u[2][1]) +
-         grad_u[0][1] * (grad_u[1][2] * grad_u[2][0] - grad_u[1][0] * grad_u[2][2]) +
-         grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] +
-         grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] -
-         grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1];
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute matrix^(-1), where matrix is nonsymetric, returns array of 9
-// -----------------------------------------------------------------------------
-#ifndef MatinvNonSym
-#define MatinvNonSym
-CEED_QFUNCTION_HELPER int computeMatinvNonSym(const CeedScalar A[3][3], const CeedScalar detA, CeedScalar Ainv[9]) {
-  // Compute A^(-1) : A-Inverse
-  CeedScalar B[9] = {
-      A[1][1] * A[2][2] - A[1][2] * A[2][1], /* *NOPAD* */
-      A[0][0] * A[2][2] - A[0][2] * A[2][0], /* *NOPAD* */
-      A[0][0] * A[1][1] - A[0][1] * A[1][0], /* *NOPAD* */
-      A[0][2] * A[1][0] - A[0][0] * A[1][2], /* *NOPAD* */
-      A[0][1] * A[1][2] - A[0][2] * A[1][1], /* *NOPAD* */
-      A[0][2] * A[2][1] - A[0][1] * A[2][2], /* *NOPAD* */
-      A[0][1] * A[2][0] - A[0][0] * A[2][1], /* *NOPAD* */
-      A[1][0] * A[2][1] - A[1][1] * A[2][0], /* *NOPAD* */
-      A[1][2] * A[2][0] - A[1][0] * A[2][2]  /* *NOPAD* */
-  };
-  for (CeedInt m = 0; m < 9; m++) Ainv[m] = B[m] / (detA);
-
-  return 0;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Common computations between Ftau and dFtau
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION_HELPER int commonFtau(const CeedScalar lambda, const CeedScalar mu, const CeedScalar Grad_u[3][3], CeedScalar Finv[3][3],
-                                     CeedScalar tau_work[6], CeedScalar *llnj) {
-  // Compute The Deformation Gradient : F = I3 + Grad_u
-  const CeedScalar F[3][3] = {
-      {Grad_u[0][0] + 1, Grad_u[0][1],     Grad_u[0][2]    },
-      {Grad_u[1][0],     Grad_u[1][1] + 1, Grad_u[1][2]    },
-      {Grad_u[2][0],     Grad_u[2][1],     Grad_u[2][2] + 1}
-  };
-
-  // b - I3 = (Grad_u + Grad_u^T + Grad_u*Grad_u^T)
-  const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-  CeedScalar    bMI3[6];
-  for (CeedInt m = 0; m < 6; m++) {
-    bMI3[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-    for (CeedInt n = 0; n < 3; n++) bMI3[m] += Grad_u[indj[m]][n] * Grad_u[indk[m]][n];
-  }
-  const CeedScalar Jm1  = computeJM1(Grad_u);
-  const CeedScalar logJ = log1p_series_shifted(Jm1);
-
-  // Computer F^(-1)
-  const CeedScalar detF = Jm1 + 1.;
-  CeedScalar       Finvwork[9];
-  computeMatinvNonSym(F, detF, Finvwork);
-
-  Finv[0][0] = Finvwork[0];
-  Finv[0][1] = Finvwork[5];
-  Finv[0][2] = Finvwork[4];
-  Finv[1][0] = Finvwork[8];
-  Finv[1][1] = Finvwork[1];
-  Finv[1][2] = Finvwork[3];
-  Finv[2][0] = Finvwork[7];
-  Finv[2][1] = Finvwork[6];
-  Finv[2][2] = Finvwork[2];
-
-  // Compute the Kirchhoff stress (tau) tau = mu*(b - I3) + lambda*log(J)*I3
-  *llnj = lambda * logJ;
-
-  tau_work[0] = mu * bMI3[0] + *llnj;
-  tau_work[1] = mu * bMI3[1] + *llnj;
-  tau_work[2] = mu * bMI3[2] + *llnj;
-  tau_work[3] = mu * bMI3[3];
-  tau_work[4] = mu * bMI3[4];
-  tau_work[5] = mu * bMI3[5];
-
-  return 0;
-};
-
-// -----------------------------------------------------------------------------
-// Residual evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH1F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-  // Store grad_u for HyperFSdF (Jacobian of HyperFSF)
-  CeedScalar(*Grad_u)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Formulation Terminology:
-  //  I3    : 3x3 Identity matrix
-  //  b     : left Cauchy-Green tensor
-  //  binv  : inverse of b
-  //  F     : deformation gradient
-  //  tau   : Kirchhoff stress (in current config)
-  // Formulation:
-  //  F =  I3 + Grad_u
-  //  J = det(F)
-  //  b = F*F(^T)
-  //  tau = mu*(b - I3) + lambda*log(J)*I3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ = q_data[0][i];
-    // dXdx_initial = dX/dx_initial
-    // X is natural coordinate sys OR Reference [-1,1]^dim
-    // x_initial is initial config coordinate system
-    const CeedScalar dXdx_initial[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) Grad_u[j][k][i] += du[j][m] * dXdx_initial[m][k];
-      }
-    }
-
-    const CeedScalar tempGradu[3][3] = {
-        {Grad_u[0][0][i], Grad_u[0][1][i], Grad_u[0][2][i]},
-        {Grad_u[1][0][i], Grad_u[1][1][i], Grad_u[1][2][i]},
-        {Grad_u[2][0][i], Grad_u[2][1][i], Grad_u[2][2][i]}
-    };
-
-    // Common components of finite strain calculations
-    CeedScalar Finv[3][3], tau_work[6], llnj;
-
-    commonFtau(lambda, mu, tempGradu, Finv, tau_work, &llnj);
-    const CeedScalar tau[3][3] = {
-        {tau_work[0], tau_work[5], tau_work[4]},
-        {tau_work[5], tau_work[1], tau_work[3]},
-        {tau_work[4], tau_work[3], tau_work[2]}
-    };
-    // x is current config coordinate system
-    // dXdx = dX/dx = dX/dx_initial * F^{-1}
-    // Note that F^{-1} = dx_initial/dx
-    CeedScalar dXdx[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dXdx[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) dXdx[j][k] += dXdx_initial[j][m] * Finv[m][k];
-      }
-    }
-
-    // Apply dXdx^T and weight to intermediate stress
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m] * tau[j][m] * wdetJ;
-      }
-    }
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Jacobian evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH1dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
-        (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  // F is used for hyperelasticity (non-linear)
-  const CeedScalar(*Grad_u)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar mu     = TwoMu / 2;
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of delta_u
-    const CeedScalar deltadu[3][3] = {
-        {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]},
-        {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]},
-        {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ = q_data[0][i];
-    // dXdx_initial = dX/dx_initial
-    // X is natural coordinate sys OR Reference [-1,1]^dim
-    // x_initial is initial config coordinate system
-    const CeedScalar dXdx_initial[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_du
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to deltadu = graddelta
-    // This is dF = Grad_du
-    CeedScalar Grad_du[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        Grad_du[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) Grad_du[j][k] += dXdx_initial[m][k] * deltadu[j][m];
-      }
-    }
-
-    const CeedScalar tempGradu[3][3] = {
-        {Grad_u[0][0][i], Grad_u[0][1][i], Grad_u[0][2][i]},
-        {Grad_u[1][0][i], Grad_u[1][1][i], Grad_u[1][2][i]},
-        {Grad_u[2][0][i], Grad_u[2][1][i], Grad_u[2][2][i]}
-    };
-
-    // Common components of finite strain calculations
-    CeedScalar F_inv[3][3], tau_work[6], llnj;
-
-    // Common components of finite strain calculations (cur. config.)
-    commonFtau(lambda, mu, tempGradu, F_inv, tau_work, &llnj);
-    const CeedScalar tau[3][3] = {
-        {tau_work[0], tau_work[5], tau_work[4]},
-        {tau_work[5], tau_work[1], tau_work[3]},
-        {tau_work[4], tau_work[3], tau_work[2]}
-    };
-
-    // Compute grad_du = \nabla_x (deltau) = deltau * dX/dx
-    CeedScalar grad_du[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        grad_du[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_du[j][k] += deltadu[j][m] * F_inv[m][k];
-      }
-    }
-
-    // Compute grad_du_tau = grad_du*tau
-    CeedScalar grad_du_tau[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        grad_du_tau[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_du_tau[j][k] += grad_du[j][m] * tau[m][k];
-      }
-    }
-
-    // Compute depsilon = (grad_du + grad_du^T)/2
-    const CeedScalar depsilon[3][3] = {
-        {(grad_du[0][0] + grad_du[0][0]) / 2., (grad_du[0][1] + grad_du[1][0]) / 2., (grad_du[0][2] + grad_du[2][0]) / 2.},
-        {(grad_du[1][0] + grad_du[0][1]) / 2., (grad_du[1][1] + grad_du[1][1]) / 2., (grad_du[1][2] + grad_du[2][1]) / 2.},
-        {(grad_du[2][0] + grad_du[0][2]) / 2., (grad_du[2][1] + grad_du[1][2]) / 2., (grad_du[2][2] + grad_du[2][2]) / 2.}
-    };
-    // Compute trace(depsilon)
-    CeedScalar tr_deps = depsilon[0][0] + depsilon[1][1] + depsilon[2][2];
-    // Compute grad_du*tau + trace(depsilon)I3
-    grad_du_tau[0][0] += lambda * tr_deps;
-    grad_du_tau[1][1] += lambda * tr_deps;
-    grad_du_tau[2][2] += lambda * tr_deps;
-    // Compute dp = grad_du*tau + trace(depsilon)I3 +2(mu-lambda*logJ)depsilon
-    CeedScalar dp[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        dp[j][k] = grad_du_tau[j][k] + 2 * (mu - llnj) * depsilon[j][k];
-      }
-    }
-
-    // x is current config coordinate system
-    // dXdx = dX/dx = dX/dx_initial * F^{-1}
-    // Note that F^{-1} = dx_initial/dx
-    CeedScalar dXdx[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dXdx[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) dXdx[j][k] += dXdx_initial[j][m] * F_inv[m][k];
-      }
-    }
-
-    // Apply dXdx^T and weight
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        deltadvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m] * dp[j][m] * wdetJ;
-      }
-    }
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Strain energy computation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH1Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*energy) = (CeedScalar(*))out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar Grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    // Strain energy Phi(E) for compressible Neo-Hookean
-    const CeedScalar Jm1  = computeJM1(Grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-    energy[i]             = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.) * wdetJ;
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Nodal diagnostic quantities for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH1Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
-        (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = Grad_u
-    CeedScalar Grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    // Displacement
-    diagnostic[0][i] = u[0][i];
-    diagnostic[1][i] = u[1][i];
-    diagnostic[2][i] = u[2][i];
-
-    // Pressure
-    const CeedScalar Jm1  = computeJM1(Grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-    diagnostic[3][i]      = -lambda * logJ;
-
-    // Stress tensor invariants
-    diagnostic[4][i] = (E2[0][0] + E2[1][1] + E2[2][2]) / 2.;
-    diagnostic[5][i] = 0.;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += E2[j][m] * E2[m][j] / 4.;
-    }
-    diagnostic[6][i] = Jm1 + 1.;
-
-    // Strain energy
-    diagnostic[7][i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.);
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-// -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-current-2.h b/examples/solids/qfunctions/finite-strain-neo-hookean-current-2.h
deleted file mode 100644
index b03334f999..0000000000
--- a/examples/solids/qfunctions/finite-strain-neo-hookean-current-2.h
+++ /dev/null
@@ -1,482 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Hyperelasticity, finite strain for solid mechanics example using PETSc
-
-#include <ceed.h>
-#include <math.h>
-
-#ifndef PHYSICS_STRUCT
-#define PHYSICS_STRUCT
-typedef struct Physics_private *Physics;
-struct Physics_private {
-  CeedScalar nu;  // Poisson's ratio
-  CeedScalar E;   // Young's Modulus
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Series approximation of log1p()
-//  log1p() is not vectorized in libc
-//
-//  The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1.
-//  The initialization extends this range to 0.35 ~= sqrt(2)/4 < J < sqrt(2)*2 ~= 2.83, which should be sufficient for applications of the Neo-Hookean
-//  model.
-// -----------------------------------------------------------------------------
-#ifndef LOG1P_SERIES_SHIFTED
-#define LOG1P_SERIES_SHIFTED
-CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) {
-  const CeedScalar left = sqrt(2.) / 2 - 1, right = sqrt(2.) - 1;
-  CeedScalar       sum = 0;
-  if (1) {           // Disable if the smaller range sqrt(2)/2 < J < sqrt(2) is sufficient
-    if (x < left) {  // Replace if with while for arbitrary range (may hurt vectorization)
-      sum -= log(2.) / 2;
-      x = 1 + 2 * x;
-    } else if (right < x) {
-      sum += log(2.) / 2;
-      x = (x - 1) / 2;
-    }
-  }
-  CeedScalar       y  = x / (2. + x);
-  const CeedScalar y2 = y * y;
-  sum += y;
-  y *= y2;
-  sum += y / 3;
-  y *= y2;
-  sum += y / 5;
-  y *= y2;
-  sum += y / 7;
-  return 2 * sum;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute det F - 1
-// -----------------------------------------------------------------------------
-#ifndef DETJM1
-#define DETJM1
-CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) {
-  return grad_u[0][0] * (grad_u[1][1] * grad_u[2][2] - grad_u[1][2] * grad_u[2][1]) +
-         grad_u[0][1] * (grad_u[1][2] * grad_u[2][0] - grad_u[1][0] * grad_u[2][2]) +
-         grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] +
-         grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] -
-         grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1];
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute matrix^(-1), where matrix is nonsymetric, returns array of 9
-// -----------------------------------------------------------------------------
-#ifndef MatinvNonSym
-#define MatinvNonSym
-CEED_QFUNCTION_HELPER int computeMatinvNonSym(const CeedScalar A[3][3], const CeedScalar detA, CeedScalar Ainv[9]) {
-  // Compute A^(-1) : A-Inverse
-  CeedScalar B[9] = {
-      A[1][1] * A[2][2] - A[1][2] * A[2][1], /* *NOPAD* */
-      A[0][0] * A[2][2] - A[0][2] * A[2][0], /* *NOPAD* */
-      A[0][0] * A[1][1] - A[0][1] * A[1][0], /* *NOPAD* */
-      A[0][2] * A[1][0] - A[0][0] * A[1][2], /* *NOPAD* */
-      A[0][1] * A[1][2] - A[0][2] * A[1][1], /* *NOPAD* */
-      A[0][2] * A[2][1] - A[0][1] * A[2][2], /* *NOPAD* */
-      A[0][1] * A[2][0] - A[0][0] * A[2][1], /* *NOPAD* */
-      A[1][0] * A[2][1] - A[1][1] * A[2][0], /* *NOPAD* */
-      A[1][2] * A[2][0] - A[1][0] * A[2][2]  /* *NOPAD* */
-  };
-  for (CeedInt m = 0; m < 9; m++) Ainv[m] = B[m] / (detA);
-
-  return 0;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Residual evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH2F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-  // Store dXdx
-  CeedScalar(*dXdx)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1];
-  // Store tau
-  CeedScalar(*tau)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[2];
-  // Store constant lam_log_J = lambda*log(J)
-  CeedScalar(*lam_log_J)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[3];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Formulation Terminology:
-  //  I3    : 3x3 Identity matrix
-  //  b     : left Cauchy-Green tensor
-  //  F     : deformation gradient
-  //  tau   : Kirchhoff stress (in current config)
-  // Formulation:
-  //  F =  I3 + Grad_ue
-  //  J = det(F)
-  //  b = F*F^{T}
-  //  tau = mu*b - (mu - lambda*log(J))*I3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ = q_data[0][i];
-    // dXdx_initial = dX/dx_initial
-    // X is natural coordinate sys OR Reference [-1,1]^dim
-    // x_initial is initial config coordinate system
-    const CeedScalar dXdx_initial[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // X is natural coordinate sys OR Reference system
-    // x_initial is initial config coordinate system
-    // Grad_u =du/dx_initial= du/dX * dX/dx_initial
-    CeedScalar Grad_u[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) Grad_u[j][k] += du[j][m] * dXdx_initial[m][k];
-      }
-    }
-
-    // Compute The Deformation Gradient : F = I3 + Gradu
-    const CeedScalar F[3][3] = {
-        {Grad_u[0][0] + 1, Grad_u[0][1],     Grad_u[0][2]    },
-        {Grad_u[1][0],     Grad_u[1][1] + 1, Grad_u[1][2]    },
-        {Grad_u[2][0],     Grad_u[2][1],     Grad_u[2][2] + 1}
-    };
-
-    // b - I3 = (Grad_u + Grad_u^T + Grad_u*Grad_u^T)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    bMI3[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      bMI3[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) bMI3[m] += Grad_u[indj[m]][n] * Grad_u[indk[m]][n];
-    }
-
-    const CeedScalar Jm1  = computeJM1(Grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-
-    // store lam_log_J = lambda*log(J)
-    lam_log_J[0][i] = lambda * logJ;
-
-    // tau = mu*b - Cc1*I3;
-    tau[0][i] = mu * bMI3[0] + lam_log_J[0][i];
-    tau[1][i] = mu * bMI3[1] + lam_log_J[0][i];
-    tau[2][i] = mu * bMI3[2] + lam_log_J[0][i];
-    tau[3][i] = mu * bMI3[3];
-    tau[4][i] = mu * bMI3[4];
-    tau[5][i] = mu * bMI3[5];
-
-    // Computer F^{-1}
-    const CeedScalar detF = Jm1 + 1.;
-    CeedScalar       Finvwork[9];
-    computeMatinvNonSym(F, detF, Finvwork);
-    CeedScalar Finv[3][3];
-    Finv[0][0] = Finvwork[0];
-    Finv[0][1] = Finvwork[5];
-    Finv[0][2] = Finvwork[4];
-    Finv[1][0] = Finvwork[8];
-    Finv[1][1] = Finvwork[1];
-    Finv[1][2] = Finvwork[3];
-    Finv[2][0] = Finvwork[7];
-    Finv[2][1] = Finvwork[6];
-    Finv[2][2] = Finvwork[2];
-
-    // x is current config coordinate system
-    // dXdx = dX/dx = dX/dx_initial * F^{-1}
-    // Note that F^{-1} = dx_initial/dx
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dXdx[j][k][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) dXdx[j][k][i] += dXdx_initial[j][m] * Finv[m][k];
-      }
-    }
-
-    const CeedScalar temptau[3][3] = {
-        {tau[0][i], tau[5][i], tau[4][i]},
-        {tau[5][i], tau[1][i], tau[3][i]},
-        {tau[4][i], tau[3][i], tau[2][i]}
-    };
-    // Apply dXdx^T and weight to intermediate stress
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m][i] * temptau[j][m] * wdetJ;
-      }
-    }
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Jacobian evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH2dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
-        (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  // dXdx computed in residual
-  const CeedScalar(*dXdx)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-  // tau computed in residual
-  const CeedScalar(*tau)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3];
-  // lam_log_J = lambda*log(J) computed in residual
-  const CeedScalar(*lam_log_J)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[4];
-  // Outputs
-  CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar mu     = TwoMu / 2;
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of delta_u
-    const CeedScalar deltadu[3][3] = {
-        {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]},
-        {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]},
-        {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ = q_data[0][i];
-
-    // Compute grad_du = \nabla_x (deltau) = deltau * dX/dx
-    CeedScalar grad_du[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        grad_du[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_du[j][k] += deltadu[j][m] * dXdx[m][k][i];
-      }
-    }
-
-    const CeedScalar temptau[3][3] = {
-        {tau[0][i], tau[5][i], tau[4][i]},
-        {tau[5][i], tau[1][i], tau[3][i]},
-        {tau[4][i], tau[3][i], tau[2][i]}
-    };
-
-    // Compute grad_du_tau = grad_du*tau
-    CeedScalar grad_du_tau[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        grad_du_tau[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_du_tau[j][k] += grad_du[j][m] * temptau[m][k];
-      }
-    }
-
-    // Compute depsilon = (grad_du + grad_du^T)/2
-    const CeedScalar depsilon[3][3] = {
-        {(grad_du[0][0] + grad_du[0][0]) / 2., (grad_du[0][1] + grad_du[1][0]) / 2., (grad_du[0][2] + grad_du[2][0]) / 2.},
-        {(grad_du[1][0] + grad_du[0][1]) / 2., (grad_du[1][1] + grad_du[1][1]) / 2., (grad_du[1][2] + grad_du[2][1]) / 2.},
-        {(grad_du[2][0] + grad_du[0][2]) / 2., (grad_du[2][1] + grad_du[1][2]) / 2., (grad_du[2][2] + grad_du[2][2]) / 2.}
-    };
-    // Compute trace(depsilon)
-    CeedScalar tr_deps = depsilon[0][0] + depsilon[1][1] + depsilon[2][2];
-    // Compute grad_du*tau + trace(depsilon)I3
-    grad_du_tau[0][0] += lambda * tr_deps;
-    grad_du_tau[1][1] += lambda * tr_deps;
-    grad_du_tau[2][2] += lambda * tr_deps;
-    // Compute dp = grad_du*tau + trace(depsilon)I3 +2(mu-lambda*logJ)depsilon
-    CeedScalar dp[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        dp[j][k] = grad_du_tau[j][k] + 2 * (mu - lam_log_J[0][i]) * depsilon[j][k];
-      }
-    }
-
-    // Apply dXdx^T and weight
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        deltadvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m][i] * dp[j][m] * wdetJ;
-      }
-    }
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Strain energy computation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH2Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*energy) = (CeedScalar(*))out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar Grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-    const CeedScalar Jm1  = computeJM1(Grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-
-    // Strain energy Phi(E) for compressible Neo-Hookean
-    energy[i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.) * wdetJ;
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Nodal diagnostic quantities for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSCurrentNH2Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
-        (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute Grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar Grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        Grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) Grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (Grad_u + Grad_u^T + Grad_u^T*Grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = Grad_u[indj[m]][indk[m]] + Grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += Grad_u[n][indj[m]] * Grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    // Displacement
-    diagnostic[0][i] = u[0][i];
-    diagnostic[1][i] = u[1][i];
-    diagnostic[2][i] = u[2][i];
-
-    // Pressure
-    const CeedScalar Jm1  = computeJM1(Grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-    diagnostic[3][i]      = -lambda * logJ;
-
-    // Stress tensor invariants
-    diagnostic[4][i] = (E2[0][0] + E2[1][1] + E2[2][2]) / 2.;
-    diagnostic[5][i] = 0.;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += E2[j][m] * E2[m][j] / 4.;
-    }
-    diagnostic[6][i] = Jm1 + 1.;
-
-    // Strain energy
-    diagnostic[7][i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.);
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-// -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-2.h b/examples/solids/qfunctions/finite-strain-neo-hookean-initial-2.h
deleted file mode 100644
index 09c4bb99ce..0000000000
--- a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-2.h
+++ /dev/null
@@ -1,559 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Hyperelasticity, finite strain for solid mechanics example using PETSc
-
-#include <ceed.h>
-#include <math.h>
-
-#ifndef PHYSICS_STRUCT
-#define PHYSICS_STRUCT
-typedef struct Physics_private *Physics;
-struct Physics_private {
-  CeedScalar nu;  // Poisson's ratio
-  CeedScalar E;   // Young's Modulus
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Series approximation of log1p()
-//  log1p() is not vectorized in libc
-//
-//  The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1.
-//  The initialization extends this range to 0.35 ~= sqrt(2)/4 < J < sqrt(2)*2 ~= 2.83, which should be sufficient for applications of the Neo-Hookean
-//  model.
-// -----------------------------------------------------------------------------
-#ifndef LOG1P_SERIES_SHIFTED
-#define LOG1P_SERIES_SHIFTED
-CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) {
-  const CeedScalar left = sqrt(2.) / 2 - 1, right = sqrt(2.) - 1;
-  CeedScalar       sum = 0;
-  if (1) {           // Disable if the smaller range sqrt(2)/2 < J < sqrt(2) is sufficient
-    if (x < left) {  // Replace if with while for arbitrary range (may hurt vectorization)
-      sum -= log(2.) / 2;
-      x = 1 + 2 * x;
-    } else if (right < x) {
-      sum += log(2.) / 2;
-      x = (x - 1) / 2;
-    }
-  }
-  CeedScalar       y  = x / (2. + x);
-  const CeedScalar y2 = y * y;
-  sum += y;
-  y *= y2;
-  sum += y / 3;
-  y *= y2;
-  sum += y / 5;
-  y *= y2;
-  sum += y / 7;
-  return 2 * sum;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute det F - 1
-// -----------------------------------------------------------------------------
-#ifndef DETJM1
-#define DETJM1
-CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) {
-  return grad_u[0][0] * (grad_u[1][1] * grad_u[2][2] - grad_u[1][2] * grad_u[2][1]) +
-         grad_u[0][1] * (grad_u[1][2] * grad_u[2][0] - grad_u[1][0] * grad_u[2][2]) +
-         grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] +
-         grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] -
-         grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1];
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Compute matrix^(-1), where matrix is symetric, returns array of 6
-// -----------------------------------------------------------------------------
-#ifndef MatinvSym
-#define MatinvSym
-CEED_QFUNCTION_HELPER int computeMatinvSym(const CeedScalar A[3][3], const CeedScalar detA, CeedScalar Ainv[6]) {
-  // Compute A^(-1) : A-Inverse
-  CeedScalar B[6] = {
-      A[1][1] * A[2][2] - A[1][2] * A[2][1], /* *NOPAD* */
-      A[0][0] * A[2][2] - A[0][2] * A[2][0], /* *NOPAD* */
-      A[0][0] * A[1][1] - A[0][1] * A[1][0], /* *NOPAD* */
-      A[0][2] * A[1][0] - A[0][0] * A[1][2], /* *NOPAD* */
-      A[0][1] * A[1][2] - A[0][2] * A[1][1], /* *NOPAD* */
-      A[0][2] * A[2][1] - A[0][1] * A[2][2]  /* *NOPAD* */
-  };
-  for (CeedInt m = 0; m < 6; m++) Ainv[m] = B[m] / (detA);
-
-  return 0;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Residual evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH2F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-  // Store grad_u for HyperFSdF (Jacobian of HyperFSF)
-  CeedScalar(*grad_u)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1];
-  // Store C_inv for HyperFSdF (Jacobian of HyperFSF)
-  CeedScalar(*C_inv)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[2];
-  // Store constant lam_log_J = lambda*log(J)
-  CeedScalar(*lam_log_J)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[3];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Formulation Terminology:
-  //  I3    : 3x3 Identity matrix
-  //  C     : right Cauchy-Green tensor
-  //  C_inv : inverse of C
-  //  F     : deformation gradient
-  //  S     : 2nd Piola-Kirchhoff (in current config)
-  //  P     : 1st Piola-Kirchhoff (in referential config)
-  // Formulation:
-  //  F =  I3 + grad_ue
-  //  J = det(F)
-  //  C = F(^T)*F
-  //  S = mu*I3 + (lambda*log(J)-mu)*C_inv;
-  //  P = F*S
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_u[j][k][i] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // I3 : 3x3 Identity matrix
-    // Compute The Deformation Gradient : F = I3 + grad_u
-    const CeedScalar F[3][3] = {
-        {grad_u[0][0][i] + 1, grad_u[0][1][i],     grad_u[0][2][i]    },
-        {grad_u[1][0][i],     grad_u[1][1][i] + 1, grad_u[1][2][i]    },
-        {grad_u[2][0][i],     grad_u[2][1][i],     grad_u[2][2][i] + 1}
-    };
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = grad_u[indj[m]][indk[m]][i] + grad_u[indk[m]][indj[m]][i];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]][i] * grad_u[n][indk[m]][i];
-    }
-
-    const CeedScalar tempgradu[3][3] = {
-        {grad_u[0][0][i], grad_u[0][1][i], grad_u[0][2][i]},
-        {grad_u[1][0][i], grad_u[1][1][i], grad_u[1][2][i]},
-        {grad_u[2][0][i], grad_u[2][1][i], grad_u[2][2][i]}
-    };
-
-    const CeedScalar Jm1  = computeJM1(tempgradu);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-    // store lam_log_J = lambda*log(J)
-    lam_log_J[0][i] = lambda * logJ;
-
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    // C : right Cauchy-Green tensor
-    // C = I + 2E
-    const CeedScalar C[3][3] = {
-        {1 + E2[0][0], E2[0][1],     E2[0][2]    },
-        {E2[0][1],     1 + E2[1][1], E2[1][2]    },
-        {E2[0][2],     E2[1][2],     1 + E2[2][2]}
-    };
-
-    // Compute C^(-1) : C-Inverse
-    const CeedScalar detC = (Jm1 + 1.) * (Jm1 + 1.);
-    CeedScalar       Cinvwork[6];
-    computeMatinvSym(C, detC, Cinvwork);
-
-    // store C_inv
-    C_inv[0][i] = Cinvwork[0];
-    C_inv[1][i] = Cinvwork[1];
-    C_inv[2][i] = Cinvwork[2];
-    C_inv[3][i] = Cinvwork[3];
-    C_inv[4][i] = Cinvwork[4];
-    C_inv[5][i] = Cinvwork[5];
-
-    const CeedScalar tempCinv[3][3] = {
-        {C_inv[0][i], C_inv[5][i], C_inv[4][i]},
-        {C_inv[5][i], C_inv[1][i], C_inv[3][i]},
-        {C_inv[4][i], C_inv[3][i], C_inv[2][i]}
-    };
-    CeedScalar Swork[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      Swork[m] = lam_log_J[0][i] * C_inv[m][i];
-      for (CeedInt n = 0; n < 3; n++) Swork[m] += mu * tempCinv[indj[m]][n] * E2[n][indk[m]];
-    }
-    // Second Piola-Kirchhoff (S)
-    const CeedScalar S[3][3] = {
-        {Swork[0], Swork[5], Swork[4]},
-        {Swork[5], Swork[1], Swork[3]},
-        {Swork[4], Swork[3], Swork[2]}
-    };
-
-    // Compute the First Piola-Kirchhoff : P = F*S
-    CeedScalar P[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        P[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) P[j][k] += F[j][m] * S[m][k];
-      }
-    }
-
-    // Apply dXdx^T and weight to P (First Piola-Kirchhoff)
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        dvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m] * P[j][m] * wdetJ;
-      }
-    }
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Jacobian evaluation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH2dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
-        (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  // grad_u is used for hyperelasticity (non-linear)
-  const CeedScalar(*grad_u)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-  const CeedScalar(*C_inv)[CEED_Q_VLA]     = (const CeedScalar(*)[CEED_Q_VLA])in[3];
-  // lam_log_J = lambda*log(J)
-  const CeedScalar(*lam_log_J)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[4];
-  // Outputs
-  CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar mu     = TwoMu / 2;
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of delta_u
-    const CeedScalar deltadu[3][3] = {
-        {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]},
-        {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]},
-        {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute graddeltau
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to deltadu = graddelta
-    CeedScalar graddeltau[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        graddeltau[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) graddeltau[j][k] += dXdx[m][k] * deltadu[j][m];
-      }
-    }
-
-    // I3 : 3x3 Identity matrix
-    // Deformation Gradient : F = I3 + grad_u
-    const CeedScalar F[3][3] = {
-        {grad_u[0][0][i] + 1, grad_u[0][1][i],     grad_u[0][2][i]    },
-        {grad_u[1][0][i],     grad_u[1][1][i] + 1, grad_u[1][2][i]    },
-        {grad_u[2][0][i],     grad_u[2][1][i],     grad_u[2][2][i] + 1}
-    };
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = grad_u[indj[m]][indk[m]][i] + grad_u[indk[m]][indj[m]][i];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]][i] * grad_u[n][indk[m]][i];
-    }
-
-    // deltaE - Green-Lagrange strain tensor
-    CeedScalar deltaEwork[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      deltaEwork[m] = 0;
-      for (CeedInt n = 0; n < 3; n++) deltaEwork[m] += (graddeltau[n][indj[m]] * F[n][indk[m]] + F[n][indj[m]] * graddeltau[n][indk[m]]) / 2.;
-    }
-    CeedScalar deltaE[3][3] = {
-        {deltaEwork[0], deltaEwork[5], deltaEwork[4]},
-        {deltaEwork[5], deltaEwork[1], deltaEwork[3]},
-        {deltaEwork[4], deltaEwork[3], deltaEwork[2]}
-    };
-
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    const CeedScalar tempCinv[3][3] = {
-        {C_inv[0][i], C_inv[5][i], C_inv[4][i]},
-        {C_inv[5][i], C_inv[1][i], C_inv[3][i]},
-        {C_inv[4][i], C_inv[3][i], C_inv[2][i]}
-    };
-    CeedScalar Swork[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      Swork[m] = lam_log_J[0][i] * C_inv[m][i];
-      for (CeedInt n = 0; n < 3; n++) Swork[m] += mu * tempCinv[indj[m]][n] * E2[n][indk[m]];
-    }
-    // Second Piola-Kirchhoff (S)
-    const CeedScalar S[3][3] = {
-        {Swork[0], Swork[5], Swork[4]},
-        {Swork[5], Swork[1], Swork[3]},
-        {Swork[4], Swork[3], Swork[2]}
-    };
-
-    // deltaS = dSdE:deltaE
-    //      = lambda(C_inv:deltaE)C_inv + 2(mu-lambda*log(J))C_inv*deltaE*C_inv
-    // -- C_inv:deltaE
-    CeedScalar Cinv_contract_E = 0;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) Cinv_contract_E += tempCinv[j][k] * deltaE[j][k];
-    }
-    // -- deltaE*C_inv
-    CeedScalar deltaECinv[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        deltaECinv[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltaECinv[j][k] += deltaE[j][m] * tempCinv[m][k];
-      }
-    }
-    // -- intermediate deltaS = C_inv*deltaE*C_inv
-    CeedScalar deltaS[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        deltaS[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltaS[j][k] += tempCinv[j][m] * deltaECinv[m][k];
-      }
-    }
-    // -- deltaS = lambda(C_inv:deltaE)C_inv - 2(lambda*log(J)-mu)*(intermediate)
-    const CeedScalar llnj_m = lam_log_J[0][i] - mu;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) deltaS[j][k] = lambda * Cinv_contract_E * tempCinv[j][k] - 2. * llnj_m * deltaS[j][k];
-    }
-
-    // deltaP = dPdF:deltaF = deltaF*S + F*deltaS
-    CeedScalar deltaP[3][3];
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt k = 0; k < 3; k++) {
-        deltaP[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltaP[j][k] += graddeltau[j][m] * S[m][k] + F[j][m] * deltaS[m][k];
-      }
-    }
-
-    // Apply dXdx^T and weight
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        deltadvdX[k][j][i] = 0;
-        for (CeedInt m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m] * deltaP[j][m] * wdetJ;
-      }
-    }
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Strain energy computation for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH2Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*energy) = (CeedScalar(*))out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = grad_u[indj[m]][indk[m]] + grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]] * grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-    const CeedScalar Jm1  = computeJM1(grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-
-    // Strain energy Phi(E) for compressible Neo-Hookean
-    energy[i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.) * wdetJ;
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Nodal diagnostic quantities for hyperelasticity, finite strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH2Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
-        (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-  const CeedScalar TwoMu   = E / (1 + nu);
-  const CeedScalar mu      = TwoMu / 2;
-  const CeedScalar Kbulk   = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda  = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar grad_u[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k] = 0;
-        for (int m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // E - Green-Lagrange strain tensor
-    //     E = 1/2 (grad_u + grad_u^T + grad_u^T*grad_u)
-    const CeedInt indj[6] = {0, 1, 2, 1, 0, 0}, indk[6] = {0, 1, 2, 2, 2, 1};
-    CeedScalar    E2work[6];
-    for (CeedInt m = 0; m < 6; m++) {
-      E2work[m] = grad_u[indj[m]][indk[m]] + grad_u[indk[m]][indj[m]];
-      for (CeedInt n = 0; n < 3; n++) E2work[m] += grad_u[n][indj[m]] * grad_u[n][indk[m]];
-    }
-    CeedScalar E2[3][3] = {
-        {E2work[0], E2work[5], E2work[4]},
-        {E2work[5], E2work[1], E2work[3]},
-        {E2work[4], E2work[3], E2work[2]}
-    };
-
-    // Displacement
-    diagnostic[0][i] = u[0][i];
-    diagnostic[1][i] = u[1][i];
-    diagnostic[2][i] = u[2][i];
-
-    // Pressure
-    const CeedScalar Jm1  = computeJM1(grad_u);
-    const CeedScalar logJ = log1p_series_shifted(Jm1);
-    diagnostic[3][i]      = -lambda * logJ;
-
-    // Stress tensor invariants
-    diagnostic[4][i] = (E2[0][0] + E2[1][1] + E2[2][2]) / 2.;
-    diagnostic[5][i] = 0.;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += E2[j][m] * E2[m][j] / 4.;
-    }
-    diagnostic[6][i] = Jm1 + 1.;
-
-    // Strain energy
-    diagnostic[7][i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.);
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-// -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-1.h b/examples/solids/qfunctions/finite-strain-neo-hookean.h
similarity index 96%
rename from examples/solids/qfunctions/finite-strain-neo-hookean-initial-1.h
rename to examples/solids/qfunctions/finite-strain-neo-hookean.h
index 431c8e328a..5742c5e8ff 100644
--- a/examples/solids/qfunctions/finite-strain-neo-hookean-initial-1.h
+++ b/examples/solids/qfunctions/finite-strain-neo-hookean.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// Hyperelasticity, finite strain for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef PHYSICS_STRUCT
 #define PHYSICS_STRUCT
@@ -52,7 +54,7 @@ CEED_QFUNCTION_HELPER CeedScalar log1p_series_shifted(CeedScalar x) {
   y *= y2;
   sum += y / 7;
   return 2 * sum;
-};
+}
 #endif
 
 // -----------------------------------------------------------------------------
@@ -66,7 +68,7 @@ CEED_QFUNCTION_HELPER CeedScalar computeJM1(const CeedScalar grad_u[3][3]) {
          grad_u[0][2] * (grad_u[1][0] * grad_u[2][1] - grad_u[2][0] * grad_u[1][1]) + grad_u[0][0] + grad_u[1][1] + grad_u[2][2] +
          grad_u[0][0] * grad_u[1][1] + grad_u[0][0] * grad_u[2][2] + grad_u[1][1] * grad_u[2][2] - grad_u[0][1] * grad_u[1][0] -
          grad_u[0][2] * grad_u[2][0] - grad_u[1][2] * grad_u[2][1];
-};
+}
 #endif
 
 // -----------------------------------------------------------------------------
@@ -86,8 +88,8 @@ CEED_QFUNCTION_HELPER int computeMatinvSym(const CeedScalar A[3][3], const CeedS
   };
   for (CeedInt m = 0; m < 6; m++) Ainv[m] = B[m] / (detA);
 
-  return 0;
-};
+  return CEED_ERROR_SUCCESS;
+}
 #endif
 
 // -----------------------------------------------------------------------------
@@ -136,13 +138,13 @@ CEED_QFUNCTION_HELPER int commonFS(const CeedScalar lambda, const CeedScalar mu,
     for (CeedInt n = 0; n < 3; n++) Swork[m] += mu * C_inv[indj[m]][n] * E2[n][indk[m]];
   }
 
-  return 0;
-};
+  return CEED_ERROR_SUCCESS;
+}
 
 // -----------------------------------------------------------------------------
 // Residual evaluation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH1F)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSResidual_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -242,13 +244,13 @@ CEED_QFUNCTION(ElasFSInitialNH1F)(void *ctx, CeedInt Q, const CeedScalar *const
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Jacobian evaluation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH1dF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSJacobian_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
         (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
@@ -387,13 +389,13 @@ CEED_QFUNCTION(ElasFSInitialNH1dF)(void *ctx, CeedInt Q, const CeedScalar *const
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Strain energy computation for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH1Energy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSEnergy_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -457,13 +459,13 @@ CEED_QFUNCTION(ElasFSInitialNH1Energy)(void *ctx, CeedInt Q, const CeedScalar *c
 
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Nodal diagnostic quantities for hyperelasticity, finite strain
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasFSInitialNH1Diagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasFSDiagnostic_NH)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
         (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
@@ -542,6 +544,6 @@ CEED_QFUNCTION(ElasFSInitialNH1Diagnostic)(void *ctx, CeedInt Q, const CeedScala
     diagnostic[7][i] = (lambda * logJ * logJ / 2. - mu * logJ + mu * (E2[0][0] + E2[1][1] + E2[2][2]) / 2.);
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 // -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/linear.h b/examples/solids/qfunctions/linear.h
index b688fdf495..b6f9573c05 100644
--- a/examples/solids/qfunctions/linear.h
+++ b/examples/solids/qfunctions/linear.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// Linear elasticity for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef PHYSICS_STRUCT
 #define PHYSICS_STRUCT
@@ -23,7 +25,7 @@ struct Physics_private {
 // -----------------------------------------------------------------------------
 // Residual evaluation for linear elasticity
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasLinearF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasResidual_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -113,13 +115,13 @@ CEED_QFUNCTION(ElasLinearF)(void *ctx, CeedInt Q, const CeedScalar *const *in, C
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Jacobian evaluation for linear elasticity
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasLineardF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasJacobian_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
         (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
@@ -208,13 +210,13 @@ CEED_QFUNCTION(ElasLineardF)(void *ctx, CeedInt Q, const CeedScalar *const *in,
     }
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Strain energy computation for linear elasticity
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasLinearEnergy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasEnergy_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
 
@@ -275,13 +277,13 @@ CEED_QFUNCTION(ElasLinearEnergy)(void *ctx, CeedInt Q, const CeedScalar *const *
 
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 
 // -----------------------------------------------------------------------------
 // Nodal diagnostic quantities for linear elasticity
 // -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasLinearDiagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+CEED_QFUNCTION(ElasDiagnostic_Linear)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Inputs
   const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
         (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
@@ -357,6 +359,6 @@ CEED_QFUNCTION(ElasLinearDiagnostic)(void *ctx, CeedInt Q, const CeedScalar *con
         (lambda * strain_vol * strain_vol / 2. + strain_vol * mu + (e[0][1] * e[0][1] + e[0][2] * e[0][2] + e[1][2] * e[1][2]) * 2 * mu);
   }  // End of Quadrature Point Loop
 
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
 // -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/manufactured-force.h b/examples/solids/qfunctions/manufactured-force.h
index 0764d103e3..41b761351f 100644
--- a/examples/solids/qfunctions/manufactured-force.h
+++ b/examples/solids/qfunctions/manufactured-force.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// Linear elasticity manufactured solution forcing term for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 #ifndef PHYSICS_STRUCT
 #define PHYSICS_STRUCT
diff --git a/examples/solids/qfunctions/manufactured-true.h b/examples/solids/qfunctions/manufactured-true.h
index 389fb6596f..25cffbd126 100644
--- a/examples/solids/qfunctions/manufactured-true.h
+++ b/examples/solids/qfunctions/manufactured-true.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,10 @@
 /// @file
 /// Linear elasticity manufactured solution true solution for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
+#ifndef CEED_RUNNING_JIT_PASS
 #include <math.h>
+#endif
 
 // -----------------------------------------------------------------------------
 // True solution for linear elasticity manufactured solution
diff --git a/examples/solids/qfunctions/small-strain-neo-hookean.h b/examples/solids/qfunctions/small-strain-neo-hookean.h
deleted file mode 100644
index 95e0afa66c..0000000000
--- a/examples/solids/qfunctions/small-strain-neo-hookean.h
+++ /dev/null
@@ -1,410 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-/// @file
-/// Hyperelasticity, small strain for solid mechanics example using PETSc
-
-#include <ceed.h>
-#include <math.h>
-
-#ifndef PHYSICS_STRUCT
-#define PHYSICS_STRUCT
-typedef struct Physics_private *Physics;
-struct Physics_private {
-  CeedScalar nu;  // Poisson's ratio
-  CeedScalar E;   // Young's Modulus
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Series approximation of log1p()
-//  log1p() is not vectorized in libc
-//
-//  The series expansion is accurate to 1e-7 in the range sqrt(2)/2 < J < sqrt(2), with machine precision accuracy near J=1.
-// -----------------------------------------------------------------------------
-#ifndef LOG1P_SERIES
-#define LOG1P_SERIES
-CEED_QFUNCTION_HELPER CeedScalar log1p_series(CeedScalar x) {
-  CeedScalar       sum = 0;
-  CeedScalar       y   = x / (2. + x);
-  const CeedScalar y2  = y * y;
-  sum += y;
-  y *= y2;
-  sum += y / 3;
-  y *= y2;
-  sum += y / 5;
-  y *= y2;
-  sum += y / 7;
-  return 2 * sum;
-};
-#endif
-
-// -----------------------------------------------------------------------------
-// Residual evaluation for hyperelasticity, small strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSNHF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*dvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-  // Store grad_u for HyperFSdF (Jacobian of HyperFSF)
-  CeedScalar(*grad_u)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[1];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k][i] = 0;
-        for (int m = 0; m < 3; m++) grad_u[j][k][i] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // Compute Strain : e (epsilon)
-    // e = 1/2 (grad u + (grad u)^T)
-    const CeedScalar e00 = (grad_u[0][0][i] + grad_u[0][0][i]) / 2., e01 = (grad_u[0][1][i] + grad_u[1][0][i]) / 2.,
-                     e02 = (grad_u[0][2][i] + grad_u[2][0][i]) / 2., e11 = (grad_u[1][1][i] + grad_u[1][1][i]) / 2.,
-                     e12 = (grad_u[1][2][i] + grad_u[2][1][i]) / 2., e22 = (grad_u[2][2][i] + grad_u[2][2][i]) / 2.;
-    const CeedScalar e[3][3] = {
-        {e00, e01, e02},
-        {e01, e11, e12},
-        {e02, e12, e22}
-    };
-
-    // strain (epsilon)
-    //    and
-    // stress (sigma) in Voigt notation:
-    //           [e00]              [sigma00]
-    //           [e11]              [sigma11]
-    // epsilon = [e22]  ,   sigma = [sigma22]
-    //           [e12]              [sigma12]
-    //           [e02]              [sigma02]
-    //           [e01]              [sigma01]
-    //
-    // mu = E / (2 * (1 + nu))
-    // bulk modulus = E / (2 * (1 - 2 * nu))
-    // lambda = (3 * bulk modulus - 2 * mu) / 3
-    // e_v = volumetric strain = e00 + e11 + e22
-    //
-    // sigma = lambda * log(1 + e_v) + 2 * mu * epsilon
-    //
-    // Above Voigt Notation is placed in a 3x3 matrix:
-    // Volumetric strain
-    const CeedScalar strain_vol = e[0][0] + e[1][1] + e[2][2];
-    const CeedScalar llv        = log1p_series(strain_vol);
-    const CeedScalar sigma00 = lambda * llv + TwoMu * e[0][0], sigma11 = lambda * llv + TwoMu * e[1][1], sigma22 = lambda * llv + TwoMu * e[2][2],
-                     sigma12 = TwoMu * e[1][2], sigma02 = TwoMu * e[0][2], sigma01 = TwoMu * e[0][1];
-    const CeedScalar sigma[3][3] = {
-        {sigma00, sigma01, sigma02},
-        {sigma01, sigma11, sigma12},
-        {sigma02, sigma12, sigma22}
-    };
-
-    // Apply dXdx^T and weight to sigma
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        dvdX[k][j][i] = 0;
-        for (int m = 0; m < 3; m++) dvdX[k][j][i] += dXdx[k][m] * sigma[j][m] * wdetJ;
-      }
-    }
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Jacobian evaluation for hyperelasticity, small strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSNHdF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*deltaug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0],
-        (*q_data)[CEED_Q_VLA]               = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-  // grad_u is used for hyperelasticity (non-linear)
-  const CeedScalar(*grad_u)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*deltadvdX)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar deltadu[3][3] = {
-        {deltaug[0][0][i], deltaug[1][0][i], deltaug[2][0][i]},
-        {deltaug[0][1][i], deltaug[1][1][i], deltaug[2][1][i]},
-        {deltaug[0][2][i], deltaug[1][2][i], deltaug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute graddeltau
-    // Apply dXdx^-1 to deltadu = graddeltau
-    CeedScalar graddeltau[3][3];
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        graddeltau[j][k] = 0;
-        for (int m = 0; m < 3; m++) graddeltau[j][k] += dXdx[m][k] * deltadu[j][m];
-      }
-    }
-
-    // Compute Strain : e (epsilon)
-    // e = 1/2 (grad u + (grad u)^T)
-    const CeedScalar de00 = (graddeltau[0][0] + graddeltau[0][0]) / 2., de01 = (graddeltau[0][1] + graddeltau[1][0]) / 2.,
-                     de02 = (graddeltau[0][2] + graddeltau[2][0]) / 2., de11 = (graddeltau[1][1] + graddeltau[1][1]) / 2.,
-                     de12 = (graddeltau[1][2] + graddeltau[2][1]) / 2., de22 = (graddeltau[2][2] + graddeltau[2][2]) / 2.;
-    const CeedScalar de[3][3] = {
-        {de00, de01, de02},
-        {de01, de11, de12},
-        {de02, de12, de22}
-    };
-
-    // strain (epsilon)
-    //     and
-    // stress (sigma) in Voigt notation:
-    //             [e00]               [sigma00]
-    //             [e11]               [sigma11]
-    //  depsilon = [e22]  ,   dsigma = [sigma22]
-    //             [e12]               [sigma12]
-    //             [e02]               [sigma02]
-    //             [e01]               [sigma01]
-    //
-    //  mu = E / (2 * (1 + nu))
-    //  bulk modulus = E / (2 * (1 - 2 * nu))
-    //  lambda = (3 * bulk modulus - 2 * mu) / 3
-    //  e_v = volumetric strain = e00 + e11 + e22
-    //  lambda bar = lambda / (1 + e_v)
-    //
-    //  dSigma = S * epsilon
-    //
-    //  S_ijkl = lambda bar * delta_ij * delta_kl + 2 * mu * delta_ik * delta_jl
-    //
-    //  Matrix form:
-    //
-    //      [2 mu + lambda bar     lambda bar         lambda bar                       ]
-    //      [   lambda bar      2 mu + lambda bar     lambda bar                       ]
-    //      [   lambda bar         lambda bar      2 mu + lambda bar                   ]
-    //  S = [                                                           mu             ]
-    //      [                                                                 mu       ]
-    //      [                                                                       mu ]
-    //
-    //  Above Voigt Notation is placed in a 3x3 matrix:
-    const CeedScalar strain_vol    = grad_u[0][0][i] + grad_u[1][1][i] + grad_u[2][2][i];
-    const CeedScalar lambda_bar    = lambda / (1 + strain_vol);
-    const CeedScalar lambda_dtrace = lambda_bar * (de[0][0] + de[1][1] + de[2][2]);
-    const CeedScalar dsigma00 = lambda_dtrace + TwoMu * de[0][0], dsigma11 = lambda_dtrace + TwoMu * de[1][1],
-                     dsigma22 = lambda_dtrace + TwoMu * de[2][2], dsigma12 = TwoMu * de[1][2], dsigma02 = TwoMu * de[0][2],
-                     dsigma01     = TwoMu * de[0][1];
-    const CeedScalar dsigma[3][3] = {
-        {dsigma00, dsigma01, dsigma02},
-        {dsigma01, dsigma11, dsigma12},
-        {dsigma02, dsigma12, dsigma22}
-    };
-
-    // Apply dXdx^-T and weight
-    for (int j = 0; j < 3; j++) {    // Component
-      for (int k = 0; k < 3; k++) {  // Derivative
-        deltadvdX[k][j][i] = 0;
-        for (int m = 0; m < 3; m++) deltadvdX[k][j][i] += dXdx[k][m] * dsigma[j][m] * wdetJ;
-      }
-    }
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Strain energy computation for hyperelasticity, small strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSNHEnergy)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1];
-
-  // Outputs
-  CeedScalar(*energy) = (CeedScalar(*))out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar mu     = TwoMu / 2;
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar wdetJ      = q_data[0][i];
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar grad_u[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // Compute Strain : e (epsilon)
-    // e = 1/2 (grad u + (grad u)^T)
-
-    const CeedScalar e[3][3] = {
-        {(grad_u[0][0] + grad_u[0][0]) / 2., (grad_u[0][1] + grad_u[1][0]) / 2., (grad_u[0][2] + grad_u[2][0]) / 2.},
-        {(grad_u[1][0] + grad_u[0][1]) / 2., (grad_u[1][1] + grad_u[1][1]) / 2., (grad_u[1][2] + grad_u[2][1]) / 2.},
-        {(grad_u[2][0] + grad_u[0][2]) / 2., (grad_u[2][1] + grad_u[1][2]) / 2., (grad_u[2][2] + grad_u[2][2]) / 2.}
-    };
-
-    // Strain Energy
-    const CeedScalar strain_vol = e[0][0] + e[1][1] + e[2][2];
-    const CeedScalar llv        = log1p_series(strain_vol);
-    energy[i] =
-        (lambda * (1 + strain_vol) * (llv - 1) + strain_vol * mu + (e[0][1] * e[0][1] + e[0][2] * e[0][2] + e[1][2] * e[1][2]) * 2 * mu) * wdetJ;
-
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-
-// -----------------------------------------------------------------------------
-// Nodal diagnostic quantities for hyperelasticity, small strain
-// -----------------------------------------------------------------------------
-CEED_QFUNCTION(ElasSSNHDiagnostic)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
-  // Inputs
-  const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[1],
-        (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2];
-
-  // Outputs
-  CeedScalar(*diagnostic)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0];
-
-  // Context
-  const Physics    context = (Physics)ctx;
-  const CeedScalar E       = context->E;
-  const CeedScalar nu      = context->nu;
-
-  // Constants
-  const CeedScalar TwoMu  = E / (1 + nu);
-  const CeedScalar mu     = TwoMu / 2;
-  const CeedScalar Kbulk  = E / (3 * (1 - 2 * nu));  // Bulk Modulus
-  const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3;
-
-  // Quadrature Point Loop
-  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
-    // Read spatial derivatives of u
-    const CeedScalar du[3][3] = {
-        {ug[0][0][i], ug[1][0][i], ug[2][0][i]},
-        {ug[0][1][i], ug[1][1][i], ug[2][1][i]},
-        {ug[0][2][i], ug[1][2][i], ug[2][2][i]}
-    };
-    // -- Qdata
-    const CeedScalar dXdx[3][3] = {
-        {q_data[1][i], q_data[2][i], q_data[3][i]},
-        {q_data[4][i], q_data[5][i], q_data[6][i]},
-        {q_data[7][i], q_data[8][i], q_data[9][i]}
-    };
-
-    // Compute grad_u
-    //   dXdx = (dx/dX)^(-1)
-    // Apply dXdx to du = grad_u
-    CeedScalar grad_u[3][3];
-    for (CeedInt j = 0; j < 3; j++) {    // Component
-      for (CeedInt k = 0; k < 3; k++) {  // Derivative
-        grad_u[j][k] = 0;
-        for (CeedInt m = 0; m < 3; m++) grad_u[j][k] += dXdx[m][k] * du[j][m];
-      }
-    }
-
-    // Compute Strain : e (epsilon)
-    // e = 1/2 (grad u + (grad u)^T)
-
-    const CeedScalar e[3][3] = {
-        {(grad_u[0][0] + grad_u[0][0]) / 2., (grad_u[0][1] + grad_u[1][0]) / 2., (grad_u[0][2] + grad_u[2][0]) / 2.},
-        {(grad_u[1][0] + grad_u[0][1]) / 2., (grad_u[1][1] + grad_u[1][1]) / 2., (grad_u[1][2] + grad_u[2][1]) / 2.},
-        {(grad_u[2][0] + grad_u[0][2]) / 2., (grad_u[2][1] + grad_u[1][2]) / 2., (grad_u[2][2] + grad_u[2][2]) / 2.}
-    };
-
-    // Displacement
-    diagnostic[0][i] = u[0][i];
-    diagnostic[1][i] = u[1][i];
-    diagnostic[2][i] = u[2][i];
-
-    // Pressure
-    const CeedScalar strain_vol = e[0][0] + e[1][1] + e[2][2];
-    const CeedScalar llv        = log1p_series(strain_vol);
-    diagnostic[3][i]            = -lambda * llv;
-
-    // Stress tensor invariants
-    diagnostic[4][i] = strain_vol;
-    diagnostic[5][i] = 0.;
-    for (CeedInt j = 0; j < 3; j++) {
-      for (CeedInt m = 0; m < 3; m++) diagnostic[5][i] += e[j][m] * e[m][j];
-    }
-    diagnostic[6][i] = 1 + strain_vol;
-
-    // Strain energy
-    diagnostic[7][i] =
-        (lambda * (1 + strain_vol) * (llv - 1) + strain_vol * mu + (e[0][1] * e[0][1] + e[0][2] * e[0][2] + e[1][2] * e[1][2]) * 2 * mu);
-  }  // End of Quadrature Point Loop
-
-  return 0;
-}
-// -----------------------------------------------------------------------------
diff --git a/examples/solids/qfunctions/traction-boundary.h b/examples/solids/qfunctions/traction-boundary.h
index 181b176d0a..6cc3c2e16d 100644
--- a/examples/solids/qfunctions/traction-boundary.h
+++ b/examples/solids/qfunctions/traction-boundary.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,7 +8,7 @@
 /// @file
 /// Geometric factors for solid mechanics example using PETSc
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 // -----------------------------------------------------------------------------
 // This QFunction computes the surface integral of the user traction vector on the constrained faces.
diff --git a/examples/solids/src/boundary.c b/examples/solids/src/boundary.c
index 2985d0d21a..2fdaac80ea 100644
--- a/examples/solids/src/boundary.c
+++ b/examples/solids/src/boundary.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/cl-options.c b/examples/solids/src/cl-options.c
index 935ee6c9b8..3dc3d7effb 100644
--- a/examples/solids/src/cl-options.c
+++ b/examples/solids/src/cl-options.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -57,10 +57,7 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx) {
   app_ctx->forcing_vector[2] = 0;
   PetscCall(PetscOptionsScalarArray("-forcing_vec", "Direction to apply constant force", NULL, app_ctx->forcing_vector, &max_n, NULL));
 
-  if ((app_ctx->problem_choice == ELAS_FSInitial_NH1 || app_ctx->problem_choice == ELAS_FSInitial_NH2 ||
-       app_ctx->problem_choice == ELAS_FSCurrent_NH1 || app_ctx->problem_choice == ELAS_FSCurrent_NH2 ||
-       app_ctx->problem_choice == ELAS_FSInitial_MR1) &&
-      app_ctx->forcing_choice == FORCE_CONST) {
+  if ((app_ctx->problem_choice == ELAS_FS_NH || app_ctx->problem_choice == ELAS_FS_MR) && app_ctx->forcing_choice == FORCE_CONST) {
     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP,
             "Cannot use constant forcing and finite strain formulation. "
             "Constant forcing in reference frame currently unavailable.");
@@ -68,8 +65,8 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx) {
 
   // Dirichlet boundary conditions
   app_ctx->bc_clamp_count = 16;
-  PetscCall(
-      PetscOptionsIntArray("-bc_clamp", "Face IDs to apply incremental Dirichlet BC", NULL, app_ctx->bc_clamp_faces, &app_ctx->bc_clamp_count, NULL));
+  PetscCall(PetscOptionsIntArray("-bc_clamp", "Face IDs to apply incremental Dirichlet BC", NULL, app_ctx->bc_clamp_faces, &app_ctx->bc_clamp_count,
+                                 NULL));
   // Set vector for each clamped BC
   for (PetscInt i = 0; i < app_ctx->bc_clamp_count; i++) {
     // Translation vector
diff --git a/examples/solids/src/matops.c b/examples/solids/src/matops.c
index bef9960fb5..31930d2446 100644
--- a/examples/solids/src/matops.c
+++ b/examples/solids/src/matops.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/misc.c b/examples/solids/src/misc.c
index d4f1986473..6c45e893b6 100644
--- a/examples/solids/src/misc.c
+++ b/examples/solids/src/misc.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/examples/solids/src/setup-dm.c b/examples/solids/src/setup-dm.c
index e70b4738dc..9a4d55a356 100644
--- a/examples/solids/src/setup-dm.c
+++ b/examples/solids/src/setup-dm.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -43,7 +43,7 @@ PetscErrorCode CreateDistributedDM(MPI_Comm comm, AppCtx app_ctx, DM *dm) {
     PetscInt dim = 3, faces[3] = {3, 3, 3};
     PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-dm_plex_box_faces", faces, &dim, NULL));
     if (!dim) dim = 3;
-    PetscCall(DMPlexCreateBoxMesh(comm, dim, PETSC_FALSE, faces, NULL, NULL, NULL, interpolate, dm));
+    PetscCall(DMPlexCreateBoxMesh(comm, dim, PETSC_FALSE, faces, NULL, NULL, NULL, interpolate, 0, PETSC_FALSE, dm));
   } else {
     PetscCall(DMPlexCreateFromFile(comm, filename, NULL, interpolate, dm));
   }
diff --git a/examples/solids/src/setup-libceed.c b/examples/solids/src/setup-libceed.c
index 608278ec5c..bfe153fcbf 100644
--- a/examples/solids/src/setup-libceed.c
+++ b/examples/solids/src/setup-libceed.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -20,11 +20,6 @@
 #include "../qfunctions/manufactured-force.h"  // Manufactured solution forcing
 #include "../qfunctions/traction-boundary.h"   // Traction boundaries
 
-#if PETSC_VERSION_LT(3, 14, 0)
-#define DMPlexGetClosureIndices(a, b, c, d, e, f, g, h, i) DMPlexGetClosureIndices(a, b, c, d, f, g, i)
-#define DMPlexRestoreClosureIndices(a, b, c, d, e, f, g, h, i) DMPlexRestoreClosureIndices(a, b, c, d, f, g, i)
-#endif
-
 // -----------------------------------------------------------------------------
 // Problem options
 // -----------------------------------------------------------------------------
@@ -321,8 +316,8 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, Ceed
       CeedOperator        op_traction;
       CeedQFunctionContextSetData(traction_ctx, CEED_MEM_HOST, CEED_USE_POINTER, 3 * sizeof(CeedScalar), app_ctx->bc_traction_vector[i]);
       // Setup restriction
-      PetscCall(
-          GetRestrictionForDomain(ceed, dm, 1, domain_label, app_ctx->bc_traction_faces[i], Q, 0, &elem_restr_u_face, &elem_restr_x_face, NULL));
+      PetscCall(GetRestrictionForDomain(ceed, dm, 1, domain_label, app_ctx->bc_traction_faces[i], Q, 0, &elem_restr_u_face, &elem_restr_x_face,
+                                        NULL));
       // ---- Create boundary Operator
       CeedOperatorCreate(ceed, qf_traction, NULL, NULL, &op_traction);
       CeedOperatorSetField(op_traction, "dx", elem_restr_x_face, basis_x_face, CEED_VECTOR_ACTIVE);
diff --git a/gallery/ceed-gallery-list.h b/gallery/ceed-gallery-list.h
index 4fa8a08227..c1829eef64 100644
--- a/gallery/ceed-gallery-list.h
+++ b/gallery/ceed-gallery-list.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -13,6 +13,7 @@
 // At the time of this writing, all the gallery functions are defined, but we're adopting the same strategy here as for the backends because future gallery @ref CeedQFunction might depend on external libraries.
 
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Identity)
+CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_IdentityScalar)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Mass1DBuild)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Mass2DBuild)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Mass3DBuild)
@@ -28,3 +29,4 @@ CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Vector3Poisson1DApply)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Vector3Poisson2DApply)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Vector3Poisson3DApply)
 CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_Scale)
+CEED_GALLERY_QFUNCTION(CeedQFunctionRegister_ScaleScalar)
diff --git a/gallery/ceed-gallery-weak.c b/gallery/ceed-gallery-weak.c
deleted file mode 100644
index bb983b9a56..0000000000
--- a/gallery/ceed-gallery-weak.c
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED:  http://github.com/ceed
-
-#include <ceed/backend.h>
-#include <ceed/types.h>
-
-// This function provides a debug target for weak symbols
-// LCOV_EXCL_START
-static int CeedQFunctionRegister_Weak(const char *name) {
-  CeedDebugEnv("** Weak Register: %s", name);
-  return CEED_ERROR_SUCCESS;
-}
-// LCOV_EXCL_STOP
-
-#define CEED_GALLERY_QFUNCTION(name)                \
-  CEED_INTERN int name(void) __attribute__((weak)); \
-  int             name(void) { return CeedQFunctionRegister_Weak(__func__); }
-#include "ceed-gallery-list.h"
-#undef CEED_GALLERY_QFUNCTION
diff --git a/gallery/identity/ceed-identity-to-scalar.c b/gallery/identity/ceed-identity-to-scalar.c
new file mode 100644
index 0000000000..403fcbafe5
--- /dev/null
+++ b/gallery/identity/ceed-identity-to-scalar.c
@@ -0,0 +1,34 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <ceed/jit-source/gallery/ceed-identity-to-scalar.h>
+#include <stddef.h>
+#include <string.h>
+
+/**
+  @brief Set fields identity `CeedQFunction` that copies first input component directly into output
+**/
+static int CeedQFunctionInit_IdentityScalar(Ceed ceed, const char *requested, CeedQFunction qf) {
+  // Check QFunction name
+  const char *name = "Identity to scalar";
+
+  CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
+
+  // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here
+
+  CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 0));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Register identity `CeedQFunction` that copies first input component directly into output
+**/
+CEED_INTERN int CeedQFunctionRegister_IdentityScalar(void) {
+  return CeedQFunctionRegister("Identity to scalar", IdentityScalar_loc, 1, IdentityScalar, CeedQFunctionInit_IdentityScalar);
+}
diff --git a/gallery/identity/ceed-identity.c b/gallery/identity/ceed-identity.c
index aa7f59eda4..415d19a274 100644
--- a/gallery/identity/ceed-identity.c
+++ b/gallery/identity/ceed-identity.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -17,21 +17,22 @@
 static int CeedQFunctionInit_Identity(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Identity";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here
 
-  CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 0));
-
   // Context data
   CeedQFunctionContext ctx;
   IdentityCtx          ctx_data = {.size = 1};
+
   CeedCall(CeedQFunctionContextCreate(ceed, &ctx));
   CeedCall(CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(ctx_data), &ctx_data));
   CeedCall(CeedQFunctionContextRegisterInt32(ctx, "size", offsetof(IdentityCtx, size), 1, "field size of identity QFunction"));
   CeedCall(CeedQFunctionSetContext(qf, ctx));
   CeedCall(CeedQFunctionContextDestroy(&ctx));
 
+  CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 0));
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/mass-vector/ceed-vectormassapply.c b/gallery/mass-vector/ceed-vectormassapply.c
index c47d77fe48..bae789a0c5 100644
--- a/gallery/mass-vector/ceed-vectormassapply.c
+++ b/gallery/mass-vector/ceed-vectormassapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Vector3MassApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Vector3MassApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt num_comp = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "u", num_comp, CEED_EVAL_INTERP));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", 1, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "v", num_comp, CEED_EVAL_INTERP));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/mass/ceed-mass1dbuild.c b/gallery/mass/ceed-mass1dbuild.c
index db4454f7b6..7931ad0c36 100644
--- a/gallery/mass/ceed-mass1dbuild.c
+++ b/gallery/mass/ceed-mass1dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Mass1DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Mass1DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 1;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/mass/ceed-mass2dbuild.c b/gallery/mass/ceed-mass2dbuild.c
index 52e10dec27..961ddbf2e9 100644
--- a/gallery/mass/ceed-mass2dbuild.c
+++ b/gallery/mass/ceed-mass2dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Mass2DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Mass2DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 2;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 4));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/mass/ceed-mass3dbuild.c b/gallery/mass/ceed-mass3dbuild.c
index fcb3ab23f3..e4edf2dd85 100644
--- a/gallery/mass/ceed-mass3dbuild.c
+++ b/gallery/mass/ceed-mass3dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Mass3DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Mass3DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 15));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/mass/ceed-massapply.c b/gallery/mass/ceed-massapply.c
index d213a7a359..11c19aa799 100644
--- a/gallery/mass/ceed-massapply.c
+++ b/gallery/mass/ceed-massapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,6 +16,7 @@
 static int CeedQFunctionInit_MassApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "MassApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
@@ -24,7 +25,6 @@ static int CeedQFunctionInit_MassApply(Ceed ceed, const char *requested, CeedQFu
   CeedCall(CeedQFunctionAddOutput(qf, "v", 1, CEED_EVAL_INTERP));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
index 2e8578d5a3..d49026a97d 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Vector3Poisson1DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Vector3Poisson1DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 1, num_comp = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
index 8eb96609ff..7e4031f477 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Vector3Poisson2DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Vector3Poisson2DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 2, num_comp = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp * 6));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
index 2506db2b45..9e1864287f 100644
--- a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
+++ b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Vector3Poisson3DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Vector3Poisson3DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 3, num_comp = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp * 15));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson1dapply.c b/gallery/poisson/ceed-poisson1dapply.c
index a9b6cef825..b007a60092 100644
--- a/gallery/poisson/ceed-poisson1dapply.c
+++ b/gallery/poisson/ceed-poisson1dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson1DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson1DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 1;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson1dbuild.c b/gallery/poisson/ceed-poisson1dbuild.c
index 69f4e1fb50..cd8075a0e8 100644
--- a/gallery/poisson/ceed-poisson1dbuild.c
+++ b/gallery/poisson/ceed-poisson1dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson1DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson1DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 1;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson2dapply.c b/gallery/poisson/ceed-poisson2dapply.c
index 5eb2d058bb..d055386dfe 100644
--- a/gallery/poisson/ceed-poisson2dapply.c
+++ b/gallery/poisson/ceed-poisson2dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson2DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson2DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 2;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 6));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson2dbuild.c b/gallery/poisson/ceed-poisson2dbuild.c
index 60a13dd7a6..7768b0d29f 100644
--- a/gallery/poisson/ceed-poisson2dbuild.c
+++ b/gallery/poisson/ceed-poisson2dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson2DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson2DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 2;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 17));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson3dapply.c b/gallery/poisson/ceed-poisson3dapply.c
index 7af449b13e..bcce1a9dc1 100644
--- a/gallery/poisson/ceed-poisson3dapply.c
+++ b/gallery/poisson/ceed-poisson3dapply.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson3DApply(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson3DApply";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
   CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 15));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/poisson/ceed-poisson3dbuild.c b/gallery/poisson/ceed-poisson3dbuild.c
index 5471701b10..3ae866ed7d 100644
--- a/gallery/poisson/ceed-poisson3dbuild.c
+++ b/gallery/poisson/ceed-poisson3dbuild.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,16 +16,17 @@
 static int CeedQFunctionInit_Poisson3DBuild(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Poisson3DBuild";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // Add QFunction fields
   const CeedInt dim = 3;
+
   CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD));
   CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT));
   CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE));
 
   CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 69));
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/gallery/scale/ceed-scale-scalar.c b/gallery/scale/ceed-scale-scalar.c
new file mode 100644
index 0000000000..ff950dbf49
--- /dev/null
+++ b/gallery/scale/ceed-scale-scalar.c
@@ -0,0 +1,31 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <ceed/jit-source/gallery/ceed-scale-scalar.h>
+#include <string.h>
+
+/**
+  @brief  Set fields for vector scaling `CeedQFunction` that scales inputs
+**/
+static int CeedQFunctionInit_ScaleScalar(Ceed ceed, const char *requested, CeedQFunction qf) {
+  // Check QFunction name
+  const char *name = "Scale (scalar)";
+
+  CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
+
+  // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Register scaling `CeedQFunction`
+**/
+CEED_INTERN int CeedQFunctionRegister_ScaleScalar(void) {
+  return CeedQFunctionRegister("Scale (scalar)", ScaleScalar_loc, 1, ScaleScalar, CeedQFunctionInit_ScaleScalar);
+}
diff --git a/gallery/scale/ceed-scale.c b/gallery/scale/ceed-scale.c
index 93fd9be24e..f998ac38e4 100644
--- a/gallery/scale/ceed-scale.c
+++ b/gallery/scale/ceed-scale.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -16,10 +16,10 @@
 static int CeedQFunctionInit_Scale(Ceed ceed, const char *requested, CeedQFunction qf) {
   // Check QFunction name
   const char *name = "Scale";
+
   CeedCheck(!strcmp(name, requested), ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested);
 
   // QFunction fields 'input' and 'output' with requested emodes added by the library rather than being added here
-
   return CEED_ERROR_SUCCESS;
 }
 
diff --git a/include/ceed-fortran-name.h b/include/ceed-fortran-name.h
index 192356fbc6..1646f3deeb 100644
--- a/include/ceed-fortran-name.h
+++ b/include/ceed-fortran-name.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed-impl.h b/include/ceed-impl.h
index 52b6beb633..e5f8773f37 100644
--- a/include/ceed-impl.h
+++ b/include/ceed-impl.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -82,16 +82,37 @@ typedef struct {
   Ceed  delegate;
 } ObjDelegate;
 
+// Work vector tracking
+typedef struct CeedWorkVectors_private *CeedWorkVectors;
+struct CeedWorkVectors_private {
+  CeedInt     num_vecs, max_vecs;
+  bool       *is_in_use;
+  CeedVector *vecs;
+};
+
+typedef struct CeedObject_private {
+  Ceed ceed;
+  int (*View)(CeedObject, FILE *);
+  int (*Destroy)(CeedObject *);
+  int     ref_count;
+  CeedInt num_view_tabs;
+} CeedObject_private;
+
 struct Ceed_private {
-  const char  *resource;
-  Ceed         delegate;
-  Ceed         parent;
-  ObjDelegate *obj_delegates;
-  int          obj_delegate_count;
-  Ceed         op_fallback_ceed, op_fallback_parent;
-  const char  *op_fallback_resource;
-  char       **jit_source_roots;
-  CeedInt      num_jit_source_roots;
+  CeedObject_private obj;
+  const char        *resource;
+  Ceed               delegate;
+  Ceed               parent;
+  ObjDelegate       *obj_delegates;
+  int                obj_delegate_count;
+  Ceed               op_fallback_ceed;
+  char             **jit_source_roots;
+  char             **rust_source_roots;
+  CeedInt            num_rust_source_roots, max_rust_source_roots, num_rust_source_roots_readers;
+  CeedInt            num_jit_source_roots, max_jit_source_roots, num_jit_source_roots_readers;
+  bool               cuda_compile_with_clang;
+  char             **jit_defines;
+  CeedInt            num_jit_defines, max_jit_defines, num_jit_defines_readers;
   int (*Error)(Ceed, const char *, int, const char *, int, const char *, va_list *);
   int (*SetStream)(Ceed, void *);
   int (*GetPreferredMemType)(CeedMemType *);
@@ -113,21 +134,22 @@ struct Ceed_private {
   int (*OperatorCreate)(CeedOperator);
   int (*OperatorCreateAtPoints)(CeedOperator);
   int (*CompositeOperatorCreate)(CeedOperator);
-  int      ref_count;
-  void    *data;
-  bool     is_debug;
-  bool     has_valid_op_fallback_resource;
-  bool     is_deterministic;
-  char     err_msg[CEED_MAX_RESOURCE_LEN];
-  FOffset *f_offsets;
+  void           *data;
+  bool            is_debug;
+  bool            is_deterministic;
+  char            err_msg[CEED_MAX_RESOURCE_LEN];
+  FOffset        *f_offsets;
+  CeedWorkVectors work_vectors;
 };
 
 struct CeedVector_private {
-  Ceed ceed;
+  CeedObject_private obj;
   int (*HasValidArray)(CeedVector, bool *);
   int (*HasBorrowedArrayOfType)(CeedVector, CeedMemType, bool *);
+  int (*CopyStrided)(CeedVector, CeedSize, CeedSize, CeedSize, CeedVector);
   int (*SetArray)(CeedVector, CeedMemType, CeedCopyMode, CeedScalar *);
   int (*SetValue)(CeedVector, CeedScalar);
+  int (*SetValueStrided)(CeedVector, CeedSize, CeedSize, CeedSize, CeedScalar);
   int (*SyncArray)(CeedVector, CeedMemType);
   int (*TakeArray)(CeedVector, CeedMemType, CeedScalar **);
   int (*GetArray)(CeedVector, CeedMemType, CeedScalar **);
@@ -142,7 +164,6 @@ struct CeedVector_private {
   int (*PointwiseMult)(CeedVector, CeedVector, CeedVector);
   int (*Reciprocal)(CeedVector);
   int (*Destroy)(CeedVector);
-  int      ref_count;
   CeedSize length;
   uint64_t state;
   uint64_t num_readers;
@@ -150,18 +171,18 @@ struct CeedVector_private {
 };
 
 struct CeedElemRestriction_private {
-  Ceed                ceed;
+  CeedObject_private  obj;
   CeedElemRestriction rstr_base;
   int (*Apply)(CeedElemRestriction, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *);
   int (*ApplyUnsigned)(CeedElemRestriction, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *);
   int (*ApplyUnoriented)(CeedElemRestriction, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *);
   int (*ApplyAtPointsInElement)(CeedElemRestriction, CeedInt, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *);
   int (*ApplyBlock)(CeedElemRestriction, CeedInt, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *);
+  int (*GetAtPointsElementOffset)(CeedElemRestriction, CeedInt, CeedSize *);
   int (*GetOffsets)(CeedElemRestriction, CeedMemType, const CeedInt **);
   int (*GetOrientations)(CeedElemRestriction, CeedMemType, const bool **);
   int (*GetCurlOrientations)(CeedElemRestriction, CeedMemType, const CeedInt8 **);
   int (*Destroy)(CeedElemRestriction);
-  int      ref_count;
   CeedInt  num_elem;    /* number of elements */
   CeedInt  elem_size;   /* number of nodes per element */
   CeedInt  num_points;  /* number of points, for points restriction */
@@ -181,11 +202,12 @@ struct CeedElemRestriction_private {
 };
 
 struct CeedBasis_private {
-  Ceed ceed;
+  CeedObject_private obj;
   int (*Apply)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector);
-  int (*ApplyAtPoints)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector);
+  int (*ApplyAdd)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector);
+  int (*ApplyAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector);
+  int (*ApplyAddAtPoints)(CeedBasis, CeedInt, const CeedInt *, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector);
   int (*Destroy)(CeedBasis);
-  int                ref_count;
   bool               is_tensor_basis; /* flag for tensor basis */
   CeedInt            dim;             /* topological dimension */
   CeedElemTopology   topo;            /* element topology */
@@ -212,11 +234,10 @@ struct CeedBasis_private {
 };
 
 struct CeedTensorContract_private {
-  Ceed ceed;
+  CeedObject_private obj;
   int (*Apply)(CeedTensorContract, CeedInt, CeedInt, CeedInt, CeedInt, const CeedScalar *restrict, CeedTransposeMode, const CeedInt,
                const CeedScalar *restrict, CeedScalar *restrict);
   int (*Destroy)(CeedTensorContract);
-  int   ref_count;
   void *data;
 };
 
@@ -227,12 +248,11 @@ struct CeedQFunctionField_private {
 };
 
 struct CeedQFunction_private {
-  Ceed ceed;
+  CeedObject_private obj;
   int (*Apply)(CeedQFunction, CeedInt, CeedVector *, CeedVector *);
   int (*SetCUDAUserFunction)(CeedQFunction, void *);
   int (*SetHIPUserFunction)(CeedQFunction, void *);
   int (*Destroy)(CeedQFunction);
-  int                  ref_count;
   CeedInt              vec_length; /* Number of quadrature points must be padded to a multiple of vec_length */
   CeedQFunctionField  *input_fields;
   CeedQFunctionField  *output_fields;
@@ -253,8 +273,7 @@ struct CeedQFunction_private {
 };
 
 struct CeedQFunctionContext_private {
-  Ceed ceed;
-  int  ref_count;
+  CeedObject_private obj;
   int (*HasValidData)(CeedQFunctionContext, bool *);
   int (*HasBorrowedDataOfType)(CeedQFunctionContext, CeedMemType, bool *);
   int (*SetData)(CeedQFunctionContext, CeedMemType, CeedCopyMode, void *);
@@ -329,9 +348,8 @@ struct CeedOperatorAssemblyData_private {
 };
 
 struct CeedOperator_private {
-  Ceed         ceed;
-  CeedOperator op_fallback, op_fallback_parent;
-  int          ref_count;
+  CeedObject_private obj;
+  CeedOperator       op_fallback, op_fallback_parent;
   int (*LinearAssembleQFunction)(CeedOperator, CeedVector *, CeedElemRestriction *, CeedRequest *);
   int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, CeedElemRestriction, CeedRequest *);
   int (*LinearAssembleDiagonal)(CeedOperator, CeedVector, CeedRequest *);
@@ -364,6 +382,7 @@ struct CeedOperator_private {
   bool                      is_composite;
   bool                      is_at_points;
   bool                      has_restriction;
+  bool                      is_sequential;
   CeedQFunctionAssemblyData qf_assembled;
   CeedOperatorAssemblyData  op_assembled;
   CeedOperator             *sub_operators;
diff --git a/include/ceed.h b/include/ceed.h
index effe28eaf1..b905b30851 100644
--- a/include/ceed.h
+++ b/include/ceed.h
@@ -1 +1,5 @@
+#ifdef CEED_RUNNING_JIT_PASS
+#include "ceed/types.h"
+#else
 #include "ceed/ceed.h"
+#endif
diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index 72868c1ff0..87f5f32c25 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -74,8 +74,12 @@
 #define CeedPragmaCritical(x) CeedPragmaOMP(critical(x))
 #endif
 
+/// This macro provides the tab width for viewing Ceed objects.
+/// @ingroup Ceed
+#define CEED_TAB_WIDTH 2
+
 /**
-  This enum supples common colors for CeedDebug256 debugging output.
+  This enum supplies common colors for CeedDebug256 debugging output.
   Set the environment variable `CEED_DEBUG = 1` to activate debugging output.
 
   @ingroup Ceed
@@ -142,8 +146,10 @@ CEED_EXTERN bool CeedDebugFlagEnv(void);
   @ingroup Ceed
   @ref     Backend
 **/
-#define CeedWarn(...) \
-  { CeedDebugImpl256(CEED_DEBUG_COLOR_WARNING, ##__VA_ARGS__); }
+#define CeedWarn(...)                                          \
+  {                                                            \
+    CeedDebugImpl256(CEED_DEBUG_COLOR_WARNING, ##__VA_ARGS__); \
+  }
 
 /**
   Swap the values of two CeedScalars
@@ -180,6 +186,11 @@ CEED_INTERN int CeedReallocArray(size_t n, size_t unit, void *p);
 CEED_INTERN int CeedStringAllocCopy(const char *source, char **copy);
 CEED_INTERN int CeedFree(void *p);
 
+CEED_INTERN int CeedObjectCreate(Ceed ceed, int (*view_function)(CeedObject, FILE *), int (*destroy_function)(CeedObject *), CeedObject obj);
+CEED_INTERN int CeedObjectReference(CeedObject obj);
+CEED_INTERN int CeedObjectDereference(CeedObject obj);
+CEED_INTERN int CeedObjectDestroy_Private(CeedObject obj);
+
 CEED_INTERN int CeedSetHostBoolArray(const bool *source_array, CeedCopyMode copy_mode, CeedSize num_values, const bool **target_array_owned,
                                      const bool **target_array_borrowed, const bool **target_array);
 CEED_INTERN int CeedSetHostCeedInt8Array(const CeedInt8 *source_array, CeedCopyMode copy_mode, CeedSize num_values,
@@ -244,14 +255,23 @@ CEED_EXTERN int CeedGetDelegate(Ceed ceed, Ceed *delegate);
 CEED_EXTERN int CeedSetDelegate(Ceed ceed, Ceed delegate);
 CEED_EXTERN int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj_name);
 CEED_EXTERN int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name);
-CEED_EXTERN int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource);
 CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed);
-CEED_EXTERN int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource);
+CEED_EXTERN int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed);
 CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic);
 CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *object, const char *func_name, void (*f)(void));
 CEED_EXTERN int CeedGetData(Ceed ceed, void *data);
 CEED_EXTERN int CeedSetData(Ceed ceed, void *data);
 CEED_EXTERN int CeedReference(Ceed ceed);
+CEED_EXTERN int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec);
+CEED_EXTERN int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec);
+CEED_EXTERN int CeedClearWorkVectors(Ceed ceed, CeedSize min_len);
+CEED_EXTERN int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb);
+CEED_EXTERN int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***jit_source_roots);
+CEED_EXTERN int CeedGetRustSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***rust_source_roots);
+CEED_EXTERN int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots);
+CEED_EXTERN int CeedRestoreRustSourceRoots(Ceed ceed, const char ***rust_source_roots);
+CEED_EXTERN int CeedGetJitDefines(Ceed ceed, CeedInt *num_defines, const char ***jit_defines);
+CEED_EXTERN int CeedRestoreJitDefines(Ceed ceed, const char ***jit_defines);
 
 CEED_EXTERN int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array);
 CEED_EXTERN int CeedVectorHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type);
@@ -261,8 +281,12 @@ CEED_EXTERN int CeedVectorGetData(CeedVector vec, void *data);
 CEED_EXTERN int CeedVectorSetData(CeedVector vec, void *data);
 CEED_EXTERN int CeedVectorReference(CeedVector vec);
 
-/// Type of element restriction;
-/// @ingroup CeedElemRestriction
+/**
+  Specify type of restriction operation.
+
+  @ingroup CeedElemRestriction
+  @ref     Backend
+**/
 typedef enum {
   /// Standard element restriction with offsets
   CEED_RESTRICTION_STANDARD = 1,
@@ -278,7 +302,7 @@ typedef enum {
 
 CEED_EXTERN int CeedElemRestrictionGetType(CeedElemRestriction rstr, CeedRestrictionType *rstr_type);
 CEED_EXTERN int CeedElemRestrictionIsStrided(CeedElemRestriction rstr, bool *is_strided);
-CEED_EXTERN int CeedElemRestrictionIsPoints(CeedElemRestriction rstr, bool *is_points);
+CEED_EXTERN int CeedElemRestrictionIsAtPoints(CeedElemRestriction rstr, bool *is_points);
 CEED_EXTERN int CeedElemRestrictionAtPointsAreCompatible(CeedElemRestriction rstr_a, CeedElemRestriction rstr_b, bool *are_compatible);
 CEED_EXTERN int CeedElemRestrictionGetStrides(CeedElemRestriction rstr, CeedInt strides[3]);
 CEED_EXTERN int CeedElemRestrictionHasBackendStrides(CeedElemRestriction rstr, bool *has_backend_strides);
@@ -292,13 +316,19 @@ CEED_EXTERN int CeedElemRestrictionGetLLayout(CeedElemRestriction rstr, CeedInt
 CEED_EXTERN int CeedElemRestrictionSetLLayout(CeedElemRestriction rstr, CeedInt layout[3]);
 CEED_EXTERN int CeedElemRestrictionGetELayout(CeedElemRestriction rstr, CeedInt layout[3]);
 CEED_EXTERN int CeedElemRestrictionSetELayout(CeedElemRestriction rstr, CeedInt layout[3]);
+CEED_EXTERN int CeedElemRestrictionGetAtPointsElementOffset(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset);
+CEED_EXTERN int CeedElemRestrictionSetAtPointsEVectorSize(CeedElemRestriction rstr, CeedSize e_size);
 CEED_EXTERN int CeedElemRestrictionGetData(CeedElemRestriction rstr, void *data);
 CEED_EXTERN int CeedElemRestrictionSetData(CeedElemRestriction rstr, void *data);
 CEED_EXTERN int CeedElemRestrictionReference(CeedElemRestriction rstr);
 CEED_EXTERN int CeedElemRestrictionGetFlopsEstimate(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedSize *flops);
 
-/// Type of FE space;
-/// @ingroup CeedBasis
+/**
+  Specify type of FE space.
+
+  @ingroup CeedBasis
+  @ref     Backend
+**/
 typedef enum {
   /// H^1 FE space
   CEED_FE_SPACE_H1 = 1,
@@ -310,16 +340,22 @@ typedef enum {
 CEED_EXTERN const char *const CeedFESpaces[];
 
 CEED_EXTERN int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *colo_grad_1d);
+CEED_EXTERN int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_1d);
 CEED_EXTERN int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor);
+CEED_EXTERN int CeedBasisIsCollocated(CeedBasis basis, bool *is_collocated);
 CEED_EXTERN int CeedBasisGetData(CeedBasis basis, void *data);
 CEED_EXTERN int CeedBasisSetData(CeedBasis basis, void *data);
 CEED_EXTERN int CeedBasisReference(CeedBasis basis);
 CEED_EXTERN int CeedBasisGetNumQuadratureComponents(CeedBasis basis, CeedEvalMode eval_mode, CeedInt *q_comp);
-CEED_EXTERN int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedSize *flops);
+CEED_EXTERN int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, bool is_at_points, CeedInt num_points,
+                                          CeedSize *flops);
 CEED_EXTERN int CeedBasisGetFESpace(CeedBasis basis, CeedFESpace *fe_space);
 CEED_EXTERN int CeedBasisGetTopologyDimension(CeedElemTopology topo, CeedInt *dim);
 CEED_EXTERN int CeedBasisGetTensorContract(CeedBasis basis, CeedTensorContract *contract);
 CEED_EXTERN int CeedBasisSetTensorContract(CeedBasis basis, CeedTensorContract contract);
+CEED_EXTERN int CeedBasisCreateH1Fallback(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts,
+                                          const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weights,
+                                          CeedBasis basis);
 
 CEED_EXTERN int  CeedTensorContractCreate(Ceed ceed, CeedTensorContract *contract);
 CEED_EXTERN int  CeedTensorContractApply(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *__restrict__ t,
@@ -341,6 +377,7 @@ CEED_EXTERN int CeedQFunctionSetFortranStatus(CeedQFunction qf, bool status);
 CEED_EXTERN int CeedQFunctionGetVectorLength(CeedQFunction qf, CeedInt *vec_length);
 CEED_EXTERN int CeedQFunctionGetNumArgs(CeedQFunction qf, CeedInt *num_input_fields, CeedInt *num_output_fields);
 CEED_EXTERN int CeedQFunctionGetKernelName(CeedQFunction qf, const char **kernel_name);
+CEED_EXTERN int CeedQFunctionGetName(CeedQFunction qf, const char **name);
 CEED_EXTERN int CeedQFunctionGetSourcePath(CeedQFunction qf, const char **source_path);
 CEED_EXTERN int CeedQFunctionLoadSourceToBuffer(CeedQFunction qf, const char **source_buffer);
 CEED_EXTERN int CeedQFunctionGetUserFunction(CeedQFunction qf, CeedQFunctionUser *f);
@@ -390,6 +427,8 @@ CEED_EXTERN int  CeedQFunctionContextReference(CeedQFunctionContext ctx);
 
 CEED_EXTERN int CeedOperatorGetBasisPointer(CeedBasis basis, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar **basis_ptr);
 CEED_EXTERN int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, CeedElemRestriction *pointblock_rstr);
+
+CEED_EXTERN int CeedOperatorGetQFunctionAssemblyData(CeedOperator op, CeedQFunctionAssemblyData *data);
 CEED_EXTERN int CeedQFunctionAssemblyDataCreate(Ceed ceed, CeedQFunctionAssemblyData *data);
 CEED_EXTERN int CeedQFunctionAssemblyDataReference(CeedQFunctionAssemblyData data);
 CEED_EXTERN int CeedQFunctionAssemblyDataSetReuse(CeedQFunctionAssemblyData data, bool reuse_assembly_data);
@@ -401,6 +440,7 @@ CEED_EXTERN int CeedQFunctionAssemblyDataSetObjects(CeedQFunctionAssemblyData da
 CEED_EXTERN int CeedQFunctionAssemblyDataGetObjects(CeedQFunctionAssemblyData data, CeedVector *vec, CeedElemRestriction *rstr);
 CEED_EXTERN int CeedQFunctionAssemblyDataDestroy(CeedQFunctionAssemblyData *data);
 
+CEED_EXTERN int CeedOperatorGetOperatorAssemblyData(CeedOperator op, CeedOperatorAssemblyData *data);
 CEED_EXTERN int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssemblyData *data);
 CEED_EXTERN int CeedOperatorAssemblyDataGetEvalModes(CeedOperatorAssemblyData data, CeedInt *num_active_bases_in, CeedInt **num_eval_modes_in,
                                                      const CeedEvalMode ***eval_modes_in, CeedSize ***eval_mode_offsets_in,
@@ -415,7 +455,6 @@ CEED_EXTERN int CeedOperatorAssemblyDataGetElemRestrictions(CeedOperatorAssembly
                                                             CeedElemRestriction **active_elem_rstrs_out);
 CEED_EXTERN int CeedOperatorAssemblyDataDestroy(CeedOperatorAssemblyData *data);
 
-CEED_EXTERN int CeedOperatorGetOperatorAssemblyData(CeedOperator op, CeedOperatorAssemblyData *data);
 CEED_EXTERN int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis);
 CEED_EXTERN int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, CeedBasis *active_output_basis);
 CEED_EXTERN int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *active_rstr);
@@ -433,6 +472,9 @@ CEED_EXTERN int CeedOperatorReference(CeedOperator op);
 CEED_EXTERN int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback);
 CEED_EXTERN int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent);
 CEED_EXTERN int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent);
+CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
+                                                                         CeedRequest *request);
+CEED_INTERN int CeedOperatorAssembleSingle(CeedOperator op, CeedInt offset, CeedVector values);
 CEED_EXTERN int CeedOperatorSetSetupDone(CeedOperator op);
 
 CEED_INTERN int CeedMatrixMatrixMultiply(Ceed ceed, const CeedScalar *mat_A, const CeedScalar *mat_B, CeedScalar *mat_C, CeedInt m, CeedInt n,
diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h
index e605c47a4b..ede7251d81 100644
--- a/include/ceed/ceed-f32.h
+++ b/include/ceed/ceed-f32.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,9 @@
 /// @file
 /// Public header for definitions related to using FP32 floating point (single precision) for CeedScalar.
 /// Include this header in ceed.h to use float instead of double.
-#ifndef CEED_F32_H
-#define CEED_F32_H
+#pragma once
+
+#define CEED_SCALAR_IS_FP32
 
 /// Set base scalar type to FP32. (See CeedScalarType enum in ceed.h for all options.)
 #define CEED_SCALAR_TYPE CEED_SCALAR_FP32
@@ -17,5 +18,3 @@ typedef float CeedScalar;
 
 /// Machine epsilon
 #define CEED_EPSILON 6e-08
-
-#endif  // CEED_F32_H
diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h
index 3e6876cc19..88e37972f9 100644
--- a/include/ceed/ceed-f64.h
+++ b/include/ceed/ceed-f64.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,9 @@
 /// @file
 /// Public header for definitions related to using FP64 floating point (double precision) for CeedScalar.
 /// This is the default header included in ceed.h.
-#ifndef CEED_F64_H
-#define CEED_F64_H
+#pragma once
+
+#define CEED_SCALAR_IS_FP64
 
 /// Set base scalar type to FP64. (See CeedScalarType enum in ceed.h for all options.)
 #define CEED_SCALAR_TYPE CEED_SCALAR_FP64
@@ -17,5 +18,3 @@ typedef double CeedScalar;
 
 /// Machine epsilon
 #define CEED_EPSILON 1e-16
-
-#endif  // CEED_F64_H
diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h
index 9ebb40534d..a76b9238c3 100644
--- a/include/ceed/ceed.h
+++ b/include/ceed/ceed.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
@@ -99,6 +99,16 @@ typedef struct CeedContextFieldLabel_private *CeedContextFieldLabel;
 /// Given an element restriction \f$E\f$, basis evaluator \f$B\f$, and quadrature function\f$f\f$, a `CeedOperator` expresses operations of the form \f$E^T B^T f(B E u)\f$ acting on the vector \f$u\f$.
 /// @ingroup CeedOperatorUser
 typedef struct CeedOperator_private *CeedOperator;
+/// Generic type for all libCEED objects to support common functionality, such as viewing
+/// @ingroup CeedUser
+typedef struct CeedObject_private *CeedObject;
+
+CEED_EXTERN int  CeedObjectView(CeedObject obj, FILE *stream);
+CEED_EXTERN int  CeedObjectSetNumViewTabs(CeedObject obj, CeedInt num_tabs);
+CEED_EXTERN int  CeedObjectGetNumViewTabs(CeedObject obj, CeedInt *num_tabs);
+CEED_EXTERN int  CeedObjectGetCeed(CeedObject obj, Ceed *ceed);
+CEED_EXTERN Ceed CeedObjectReturnCeed(CeedObject obj);
+CEED_EXTERN int  CeedObjectDestroy(CeedObject *obj);
 
 CEED_EXTERN int CeedRegistryGetList(size_t *n, char ***const resources, CeedInt **array);
 CEED_EXTERN int CeedInit(const char *resource, Ceed *ceed);
@@ -107,6 +117,10 @@ CEED_EXTERN int CeedReferenceCopy(Ceed ceed, Ceed *ceed_copy);
 CEED_EXTERN int CeedGetResource(Ceed ceed, const char **resource);
 CEED_EXTERN int CeedIsDeterministic(Ceed ceed, bool *is_deterministic);
 CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root);
+CEED_EXTERN int CeedAddRustSourceRoot(Ceed ceed, const char *rust_source_root);
+CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define);
+CEED_EXTERN int CeedSetNumViewTabs(Ceed ceed, CeedInt num_tabs);
+CEED_EXTERN int CeedGetNumViewTabs(Ceed ceed, CeedInt *num_tabs);
 CEED_EXTERN int CeedView(Ceed ceed, FILE *stream);
 CEED_EXTERN int CeedDestroy(Ceed *ceed);
 CEED_EXTERN int CeedErrorImpl(Ceed ceed, const char *filename, int lineno, const char *func, int ecode, const char *format, ...);
@@ -162,6 +176,11 @@ CEED_EXTERN int CeedErrorExit(Ceed ceed, const char *filename, int line_no, cons
     (CEED_VERSION_MAJOR == major && (CEED_VERSION_MINOR > minor || (CEED_VERSION_MINOR == minor && CEED_VERSION_PATCH >= patch)))))
 
 CEED_EXTERN int CeedGetVersion(int *major, int *minor, int *patch, bool *release);
+CEED_EXTERN int CeedGetGitVersion(const char **git_version);
+CEED_EXTERN int CeedGetBuildConfiguration(const char **build_config);
+
+CEED_EXTERN int CeedSetIsClang(Ceed ceed, bool isClang);
+CEED_EXTERN int CeedGetIsClang(Ceed ceed, bool *isClang);
 
 CEED_EXTERN int CeedGetScalarType(CeedScalarType *scalar_type);
 
@@ -180,8 +199,10 @@ CEED_EXTERN int CeedGetPreferredMemType(Ceed ceed, CeedMemType *type);
 CEED_EXTERN int  CeedVectorCreate(Ceed ceed, CeedSize len, CeedVector *vec);
 CEED_EXTERN int  CeedVectorReferenceCopy(CeedVector vec, CeedVector *vec_copy);
 CEED_EXTERN int  CeedVectorCopy(CeedVector vec, CeedVector vec_copy);
+CEED_EXTERN int  CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy);
 CEED_EXTERN int  CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array);
 CEED_EXTERN int  CeedVectorSetValue(CeedVector vec, CeedScalar value);
+CEED_EXTERN int  CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar value);
 CEED_EXTERN int  CeedVectorSyncArray(CeedVector vec, CeedMemType mem_type);
 CEED_EXTERN int  CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array);
 CEED_EXTERN int  CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array);
@@ -195,6 +216,8 @@ CEED_EXTERN int  CeedVectorAXPY(CeedVector y, CeedScalar alpha, CeedVector x);
 CEED_EXTERN int  CeedVectorAXPBY(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x);
 CEED_EXTERN int  CeedVectorPointwiseMult(CeedVector w, CeedVector x, CeedVector y);
 CEED_EXTERN int  CeedVectorReciprocal(CeedVector vec);
+CEED_EXTERN int  CeedVectorSetNumViewTabs(CeedVector vec, CeedInt num_tabs);
+CEED_EXTERN int  CeedVectorGetNumViewTabs(CeedVector vec, CeedInt *num_tabs);
 CEED_EXTERN int  CeedVectorViewRange(CeedVector vec, CeedSize start, CeedSize stop, CeedInt step, const char *fp_fmt, FILE *stream);
 CEED_EXTERN int  CeedVectorView(CeedVector vec, const char *fp_fmt, FILE *stream);
 CEED_EXTERN int  CeedVectorGetCeed(CeedVector vec, Ceed *ceed);
@@ -276,12 +299,16 @@ CEED_EXTERN int  CeedElemRestrictionGetElementSize(CeedElemRestriction rstr, Cee
 CEED_EXTERN int  CeedElemRestrictionGetNumPoints(CeedElemRestriction rstr, CeedInt *num_points);
 CEED_EXTERN int  CeedElemRestrictionGetNumPointsInElement(CeedElemRestriction rstr, CeedInt elem, CeedInt *num_points);
 CEED_EXTERN int  CeedElemRestrictionGetMaxPointsInElement(CeedElemRestriction rstr, CeedInt *max_points);
+CEED_EXTERN int  CeedElemRestrictionGetMinPointsInElement(CeedElemRestriction rstr, CeedInt *min_points);
+CEED_EXTERN int  CeedElemRestrictionGetMinMaxPointsInElement(CeedElemRestriction rstr, CeedInt *min_points, CeedInt *max_points);
 CEED_EXTERN int  CeedElemRestrictionGetLVectorSize(CeedElemRestriction rstr, CeedSize *l_size);
 CEED_EXTERN int  CeedElemRestrictionGetEVectorSize(CeedElemRestriction rstr, CeedSize *e_size);
 CEED_EXTERN int  CeedElemRestrictionGetNumComponents(CeedElemRestriction rstr, CeedInt *num_comp);
 CEED_EXTERN int  CeedElemRestrictionGetNumBlocks(CeedElemRestriction rstr, CeedInt *num_block);
 CEED_EXTERN int  CeedElemRestrictionGetBlockSize(CeedElemRestriction rstr, CeedInt *block_size);
 CEED_EXTERN int  CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult);
+CEED_EXTERN int  CeedElemRestrictionSetNumViewTabs(CeedElemRestriction rstr, CeedInt num_tabs);
+CEED_EXTERN int  CeedElemRestrictionGetNumViewTabs(CeedElemRestriction rstr, CeedInt *num_tabs);
 CEED_EXTERN int  CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream);
 CEED_EXTERN int  CeedElemRestrictionDestroy(CeedElemRestriction *rstr);
 
@@ -289,23 +316,28 @@ CEED_EXTERN int  CeedElemRestrictionDestroy(CeedElemRestriction *rstr);
 //  \int_\Omega v^T f_0(u, \nabla u, qdata) + (\nabla v)^T f_1(u, \nabla u, qdata)
 // where gradients are with respect to the reference element.
 
-CEED_EXTERN int  CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P, CeedInt Q, CeedQuadMode quad_mode,
-                                                 CeedBasis *basis);
-CEED_EXTERN int  CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d,
-                                         const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis *basis);
-CEED_EXTERN int  CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
-                                   const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
-CEED_EXTERN int  CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
-                                     const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
-CEED_EXTERN int  CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
-                                      const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
-CEED_EXTERN int  CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis *basis_project);
-CEED_EXTERN int  CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy);
-CEED_EXTERN int  CeedBasisView(CeedBasis basis, FILE *stream);
-CEED_EXTERN int  CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v);
-CEED_EXTERN int  CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref,
-                                        CeedVector u, CeedVector v);
-CEED_EXTERN int  CeedBasisGetCeed(CeedBasis basis, Ceed *ceed);
+CEED_EXTERN int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P, CeedInt Q, CeedQuadMode quad_mode,
+                                                CeedBasis *basis);
+CEED_EXTERN int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d,
+                                        const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis *basis);
+CEED_EXTERN int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
+                                  const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
+CEED_EXTERN int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
+                                    const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
+CEED_EXTERN int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp,
+                                     const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis);
+CEED_EXTERN int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis *basis_project);
+CEED_EXTERN int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy);
+CEED_EXTERN int CeedBasisSetNumViewTabs(CeedBasis basis, CeedInt num_tabs);
+CEED_EXTERN int CeedBasisGetNumViewTabs(CeedBasis basis, CeedInt *num_tabs);
+CEED_EXTERN int CeedBasisView(CeedBasis basis, FILE *stream);
+CEED_EXTERN int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v);
+CEED_EXTERN int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v);
+CEED_EXTERN int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                       CeedVector x_ref, CeedVector u, CeedVector v);
+CEED_EXTERN int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                          CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v);
+CEED_EXTERN int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed);
 CEED_EXTERN Ceed CeedBasisReturnCeed(CeedBasis basis);
 CEED_EXTERN int  CeedBasisGetDimension(CeedBasis basis, CeedInt *dim);
 CEED_EXTERN int  CeedBasisGetTopology(CeedBasis basis, CeedElemTopology *topo);
@@ -354,16 +386,17 @@ CEED_EXTERN int  CeedQFunctionGetFields(CeedQFunction qf, CeedInt *num_input_fie
 CEED_EXTERN int  CeedQFunctionSetContext(CeedQFunction qf, CeedQFunctionContext ctx);
 CEED_EXTERN int  CeedQFunctionSetContextWritable(CeedQFunction qf, bool is_writable);
 CEED_EXTERN int  CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops);
+CEED_EXTERN int  CeedQFunctionSetNumViewTabs(CeedQFunction qf, CeedInt num_tabs);
+CEED_EXTERN int  CeedQFunctionGetNumViewTabs(CeedQFunction qf, CeedInt *num_tabs);
 CEED_EXTERN int  CeedQFunctionView(CeedQFunction qf, FILE *stream);
 CEED_EXTERN int  CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed);
 CEED_EXTERN Ceed CeedQFunctionReturnCeed(CeedQFunction qf);
 CEED_EXTERN int  CeedQFunctionApply(CeedQFunction qf, CeedInt Q, CeedVector *u, CeedVector *v);
 CEED_EXTERN int  CeedQFunctionDestroy(CeedQFunction *qf);
-
-CEED_EXTERN int CeedQFunctionFieldGetName(CeedQFunctionField qf_field, const char **field_name);
-CEED_EXTERN int CeedQFunctionFieldGetSize(CeedQFunctionField qf_field, CeedInt *size);
-CEED_EXTERN int CeedQFunctionFieldGetEvalMode(CeedQFunctionField qf_field, CeedEvalMode *eval_mode);
-CEED_EXTERN int CeedQFunctionFieldGetData(CeedQFunctionField qf_field, const char **field_name, CeedInt *size, CeedEvalMode *eval_mode);
+CEED_EXTERN int  CeedQFunctionFieldGetName(CeedQFunctionField qf_field, const char **field_name);
+CEED_EXTERN int  CeedQFunctionFieldGetSize(CeedQFunctionField qf_field, CeedInt *size);
+CEED_EXTERN int  CeedQFunctionFieldGetEvalMode(CeedQFunctionField qf_field, CeedEvalMode *eval_mode);
+CEED_EXTERN int  CeedQFunctionFieldGetData(CeedQFunctionField qf_field, const char **field_name, CeedInt *size, CeedEvalMode *eval_mode);
 
 /** Handle for the user provided @ref CeedQFunctionContextDestroy() callback function
 
@@ -393,13 +426,15 @@ CEED_EXTERN int CeedQFunctionContextGetAllFieldLabels(CeedQFunctionContext ctx,
 CEED_EXTERN int CeedContextFieldLabelGetDescription(CeedContextFieldLabel label, const char **field_name, size_t *field_offset, size_t *num_values,
                                                     const char **field_description, CeedContextFieldType *field_type);
 CEED_EXTERN int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_size);
+CEED_EXTERN int CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext ctx, CeedInt num_tabs);
+CEED_EXTERN int CeedQFunctionContextGetNumViewTabs(CeedQFunctionContext ctx, CeedInt *num_tabs);
 CEED_EXTERN int CeedQFunctionContextView(CeedQFunctionContext ctx, FILE *stream);
 CEED_EXTERN int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, CeedMemType f_mem_type, CeedQFunctionContextDataDestroyUser f);
 CEED_EXTERN int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx);
 
 CEED_EXTERN int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunction dqfT, CeedOperator *op);
 CEED_EXTERN int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunction dqfT, CeedOperator *op);
-CEED_EXTERN int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op);
+CEED_EXTERN int CeedOperatorCreateComposite(Ceed ceed, CeedOperator *op);
 CEED_EXTERN int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy);
 CEED_EXTERN int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction rstr, CeedBasis basis, CeedVector vec);
 CEED_EXTERN int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperatorField **input_fields, CeedInt *num_output_fields,
@@ -408,9 +443,12 @@ CEED_EXTERN int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields
 CEED_EXTERN int  CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_points, CeedVector point_coords);
 CEED_EXTERN int  CeedOperatorAtPointsGetPoints(CeedOperator op, CeedElemRestriction *rstr_points, CeedVector *point_coords);
 CEED_EXTERN int  CeedOperatorIsAtPoints(CeedOperator op, bool *is_at_points);
-CEED_EXTERN int  CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op);
-CEED_EXTERN int  CeedCompositeOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators);
-CEED_EXTERN int  CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operators);
+CEED_EXTERN int  CeedOperatorCompositeAddSub(CeedOperator composite_op, CeedOperator sub_op);
+CEED_EXTERN int  CeedOperatorCompositeGetNumSub(CeedOperator op, CeedInt *num_suboperators);
+CEED_EXTERN int  CeedOperatorCompositeGetSubList(CeedOperator op, CeedOperator **sub_operators);
+CEED_EXTERN int  CeedOperatorCompositeGetSubByName(CeedOperator op, const char *op_name, CeedOperator *sub_op);
+CEED_EXTERN int  CeedOperatorCompositeSetSequential(CeedOperator op, bool is_sequential);
+CEED_EXTERN int  CeedOperatorCompositeIsSequential(CeedOperator op, bool *is_sequential);
 CEED_EXTERN int  CeedOperatorCheckReady(CeedOperator op);
 CEED_EXTERN int  CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size);
 CEED_EXTERN int  CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data);
@@ -424,8 +462,9 @@ CEED_EXTERN int  CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, C
 CEED_EXTERN int  CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request);
 CEED_EXTERN int  CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols);
 CEED_EXTERN int  CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols);
+CEED_EXTERN int  CeedOperatorLinearAssembleGetNumEntries(CeedOperator op, CeedSize *num_entries);
 CEED_EXTERN int  CeedOperatorLinearAssemble(CeedOperator op, CeedVector values);
-CEED_EXTERN int  CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult);
+CEED_EXTERN int  CeedOperatorCompositeGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult);
 CEED_EXTERN int  CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse,
                                                   CeedBasis basis_coarse, CeedOperator *op_coarse, CeedOperator *op_prolong,
                                                   CeedOperator *op_restrict);
@@ -437,7 +476,11 @@ CEED_EXTERN int  CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVe
                                                     CeedOperator *op_prolong, CeedOperator *op_restrict);
 CEED_EXTERN int  CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, CeedRequest *request);
 CEED_EXTERN int  CeedOperatorSetName(CeedOperator op, const char *name);
+CEED_EXTERN int  CeedOperatorGetName(CeedOperator op, const char **name);
+CEED_EXTERN int  CeedOperatorSetNumViewTabs(CeedOperator op, CeedInt num_tabs);
+CEED_EXTERN int  CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs);
 CEED_EXTERN int  CeedOperatorView(CeedOperator op, FILE *stream);
+CEED_EXTERN int  CeedOperatorViewTerse(CeedOperator op, FILE *stream);
 CEED_EXTERN int  CeedOperatorGetCeed(CeedOperator op, Ceed *ceed);
 CEED_EXTERN Ceed CeedOperatorReturnCeed(CeedOperator op);
 CEED_EXTERN int  CeedOperatorGetNumElements(CeedOperator op, CeedInt *num_elem);
@@ -456,8 +499,13 @@ CEED_EXTERN int  CeedOperatorGetContextBooleanRead(CeedOperator op, CeedContextF
 CEED_EXTERN int  CeedOperatorRestoreContextBooleanRead(CeedOperator op, CeedContextFieldLabel field_label, const bool **values);
 CEED_EXTERN int  CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request);
 CEED_EXTERN int  CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request);
+CEED_EXTERN int  CeedOperatorApplyAddActive(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request);
+CEED_EXTERN int  CeedOperatorAssemblyDataStrip(CeedOperator op);
 CEED_EXTERN int  CeedOperatorDestroy(CeedOperator *op);
 
+// Compatibility with previous composite CeedOperator naming
+#include "deprecated.h"
+
 CEED_EXTERN int CeedOperatorGetFieldByName(CeedOperator op, const char *field_name, CeedOperatorField *op_field);
 CEED_EXTERN int CeedOperatorFieldGetName(CeedOperatorField op_field, const char **field_name);
 CEED_EXTERN int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, CeedElemRestriction *rstr);
diff --git a/include/ceed/cuda.h b/include/ceed/cuda.h
index 839e64fed7..eb9ac3e9cb 100644
--- a/include/ceed/cuda.h
+++ b/include/ceed/cuda.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/deprecated.h b/include/ceed/deprecated.h
new file mode 100644
index 0000000000..233b910a60
--- /dev/null
+++ b/include/ceed/deprecated.h
@@ -0,0 +1,38 @@
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+///
+/// SPDX-License-Identifier: BSD-2-Clause
+///
+/// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Public header for user and utility components of libCEED
+#pragma once
+
+#if __STDC_VERSION__ >= 202311L
+#define DEPRECATED(msg) [[deprecated(msg)]]
+#elif defined(__GNUC__) || defined(__clang__)
+#define DEPRECATED(msg) __attribute__((deprecated(msg)))
+#else
+#define DEPRECATED(msg)
+#endif
+
+// Compatibility with previous composite CeedOperator naming
+DEPRECATED("Use CeedOperatorCreateComposite()")
+static inline int CeedCompositeOperatorCreate(Ceed a, CeedOperator *b) { return CeedOperatorCreateComposite(a, b); }
+DEPRECATED("Use CeedOperatorCompositeAddSub()")
+static inline int CeedCompositeOperatorAddSub(CeedOperator a, CeedOperator b) { return CeedOperatorCompositeAddSub(a, b); }
+DEPRECATED("Use CeedOperatorCompositeGetNumSub()")
+static inline int CeedCompositeOperatorGetNumSub(CeedOperator a, CeedInt *b) { return CeedOperatorCompositeGetNumSub(a, b); }
+DEPRECATED("Use CeedOperatorCompositeGetSubList()")
+static inline int CeedCompositeOperatorGetSubList(CeedOperator a, CeedOperator **b) { return CeedOperatorCompositeGetSubList(a, b); }
+DEPRECATED("Use CeedOperatorCompositeGetSubByName()")
+static inline int CeedCompositeOperatorGetSubByName(CeedOperator a, const char *b, CeedOperator *c) {
+  return CeedOperatorCompositeGetSubByName(a, b, c);
+}
+DEPRECATED("Use CeedOperatorCompositeGetMultiplicity()")
+static inline int CeedCompositeOperatorGetMultiplicity(CeedOperator a, CeedInt b, CeedInt *c, CeedVector d) {
+  return CeedOperatorCompositeGetMultiplicity(a, b, c, d);
+}
+
+#undef DEPRECATED
diff --git a/include/ceed/fortran.h b/include/ceed/fortran.h
index bb7bcac396..ed0c0ef628 100644
--- a/include/ceed/fortran.h
+++ b/include/ceed/fortran.h
@@ -1,4 +1,4 @@
-! Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+! Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 ! All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 !
 ! SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/gen-tools.h b/include/ceed/gen-tools.h
new file mode 100644
index 0000000000..f1f3743000
--- /dev/null
+++ b/include/ceed/gen-tools.h
@@ -0,0 +1,27 @@
+#include <ceed.h>
+#include <sstream>
+
+class Tab {
+ private:
+  CeedInt       _num_tabs{0};
+  const CeedInt _width{2};
+
+  template <class OStream>
+  friend OStream &operator<<(OStream &os, const Tab &tab);
+
+ public:
+  Tab &push() {
+    _num_tabs++;
+    return *this;
+  }
+  Tab &pop() {
+    if (_num_tabs > 0) _num_tabs--;
+    return *this;
+  }
+};
+
+template <class OStream>
+OStream &operator<<(OStream &os, const Tab &tab) {
+  os << std::string(tab._num_tabs * tab._width, ' ');
+  return os;
+}
diff --git a/include/ceed/hip.h b/include/ceed/hip.h
index 2c0e156872..86ba7dc098 100644
--- a/include/ceed/hip.h
+++ b/include/ceed/hip.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
index da92667707..351c3be86c 100644
--- a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
+++ b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA atomic add fallback definition
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Atomic add, for older CUDA
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index f3d7052e3c..5fd998d9e9 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,30 +7,74 @@
 
 /// @file
 /// Internal header for CUDA backend macro and type definitions for JiT source
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Load matrices for basis actions
 //------------------------------------------------------------------------------
 template <int P, int Q>
-inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }
 
+//------------------------------------------------------------------------------
+// AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// L-vector -> single point
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
+inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+  const CeedInt ind = indices[p + elem * NUM_PTS];
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_u[comp] = d_u[ind + comp * COMP_STRIDE];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Single point -> L-vector
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
+inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
+  if (p < points_in_elem) {
+    const CeedInt ind = indices[p + elem * NUM_PTS];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_u[ind + comp * COMP_STRIDE] += r_u[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 1D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp = n / P_1D;
+  const CeedInt target_node = n % P_1D;
+
+  if (data.t_id_x == target_node) {
+    r_v[target_comp] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d) {
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
-    const CeedInt ind  = indices[node + elem * P_1d];
+    const CeedInt ind  = indices[node + elem * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -39,10 +83,10 @@ inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt num
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                          CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
@@ -53,24 +97,72 @@ inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt el
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d) {
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
-    const CeedInt ind  = indices[node + elem * P_1d];
+    const CeedInt ind  = indices[node + elem * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
   }
 }
 
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp = n / P_1D;
+  const CeedInt target_node = n % P_1D;
+
+  if (data.t_id_x == target_node) {
+    const CeedInt ind = indices[target_node + elem * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt in_comp    = in / P_1D;
+  const CeedInt in_node    = in % P_1D;
+  const CeedInt e_vec_size = P_1D * NUM_COMP;
+
+  if (data.t_id_x < P_1D) {
+    const CeedInt out_node = data.t_id_x;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D) {
+    const CeedInt ind = data.t_id_x + elem * Q_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
@@ -82,15 +174,29 @@ inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt e
 // 2D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    r_v[target_comp] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
-    const CeedInt ind  = indices[node + elem * P_1d * P_1d];
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -99,11 +205,11 @@ inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt num
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                          CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
@@ -113,25 +219,80 @@ inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt el
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
-    const CeedInt ind  = indices[node + elem * P_1d * P_1d];
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
   }
 }
 
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt elem_size  = P_1D * P_1D;
+  const CeedInt in_comp    = in / elem_size;
+  const CeedInt in_node_x  = in % P_1D;
+  const CeedInt in_node_y  = (in % elem_size) / P_1D;
+  const CeedInt e_vec_size = elem_size * NUM_COMP;
+
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt in_node  = in_node_x + in_node_y * P_1D;
+    const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
+
+      d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
@@ -142,52 +303,63 @@ inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt e
 // 3D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    r_v[target_node_z + target_comp * P_1D] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-// TODO: remove "Dofs" and "Quads" in the following function names?
-//   - readDofsOffset3d -> readOffset3d ?
-//   - readDofsStrided3d -> readStrided3d ?
-//   - readSliceQuadsOffset3d -> readSliceOffset3d ?
-//   - readSliceQuadsStrided3d -> readSliceStrided3d ?
-//   - writeDofsOffset3d -> writeOffset3d ?
-//   - writeDofsStrided3d -> writeStrided3d ?
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
-      const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
-
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp];
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
+
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                          CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP];
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, offests provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
-inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
-                                              const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
-    const CeedInt ind  = indices[node + elem * Q_1d * Q_1d * Q_1d];
+template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
+inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
+                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
+                                               CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
+    const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -196,11 +368,11 @@ inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedI
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
-                                               CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
+template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
+                                              CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
@@ -210,55 +382,122 @@ inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
-      const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
     }
+  }
+}
+
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt elem_size  = P_1D * P_1D * P_1D;
+  const CeedInt in_comp    = in / elem_size;
+  const CeedInt in_node_x  = in % P_1D;
+  const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
+  const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
+  const CeedInt e_vec_size = elem_size * NUM_COMP;
+
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+        const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
+
+        d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    for (CeedInt z = 0; z < Q_1D; z++) {
+      const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;
+
+      for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+        d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D];
+      }
+    }
+  }
 }
 
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
 // 3D collocated derivatives computation
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d>
-inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                   CeedScalar *__restrict__ r_V) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                        CeedScalar *__restrict__ r_V) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
+      __syncthreads();
+      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
       __syncthreads();
       // X derivative
       r_V[comp + 0 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
+      }
       // Y derivative
       r_V[comp + 1 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
+      }
       // Z derivative
       r_V[comp + 2 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];  // Contract z direction (Z derivative)
-      __syncthreads();
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
+      }
     }
   }
 }
@@ -266,26 +505,29 @@ inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const
 //------------------------------------------------------------------------------
 // 3D collocated derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d>
-inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                            CeedScalar *__restrict__ r_V) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                 CeedScalar *__restrict__ r_V) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      // X derivative
-      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
       __syncthreads();
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
+      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
       __syncthreads();
+      // X derivative
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
+      }
       // Y derivative
-      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
       __syncthreads();
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
+      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
       __syncthreads();
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
+      }
       // Z derivative
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];  // PARTIAL contract z direction (Z derivative)
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
+      }
     }
   }
 }
diff --git a/include/ceed/jit-source/cuda/cuda-jit.h b/include/ceed/jit-source/cuda/cuda-jit.h
index 1aedb54dbe..d9cd5a8963 100644
--- a/include/ceed/jit-source/cuda/cuda-jit.h
+++ b/include/ceed/jit-source/cuda/cuda-jit.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -13,4 +13,8 @@
 #define CeedPragmaSIMD
 #define CEED_Q_VLA 1
 
+#define CEED_QFUNCTION_RUST(name)                                                                                       \
+  extern "C" __device__ int name##_rs(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out); \
+  static __device__ int name(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return name##_rs(ctx, Q, in, out); }
+
 #include "cuda-types.h"
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
index 64b57d0d68..2a4967f807 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA non-tensor product basis templates
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Tensor contraction
@@ -53,9 +52,9 @@ inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strid
     // Run with P threads
     r_V = 0.0;
     for (CeedInt d = 0; d < Q_COMP; d++) {
-      U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U;
+      U = &d_U[elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U];
       for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i];
     }
-    d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V;
+    d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V;
   }
 }
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
index 6dbf8771d8..c441e414ef 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA non-tensor product basis
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "cuda-ref-basis-nontensor-templates.h"
 
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
new file mode 100644
index 0000000000..602a6d1f40
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -0,0 +1,408 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA tensor product basis with AtPoints evaluation
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// Chebyshev values
+//------------------------------------------------------------------------------
+template <int Q_1D>
+inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
+  chebyshev_x[0] = 1.0;
+  chebyshev_x[1] = 2 * x;
+  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
+}
+
+template <int Q_1D>
+inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
+  CeedScalar chebyshev_x[3];
+
+  chebyshev_x[1]  = 1.0;
+  chebyshev_x[2]  = 2 * x;
+  chebyshev_dx[0] = 0.0;
+  chebyshev_dx[1] = 2.0;
+  for (CeedInt i = 2; i < Q_1D; i++) {
+    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
+    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Tensor Basis Kernels AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interp
+//------------------------------------------------------------------------------
+extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
+        }
+        post *= Q;
+      }
+
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        pre  = BASIS_NUM_QPTS;
+        post = 1;
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          // Update buffers used
+          pre /= Q;
+          const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
+
+          // Contract along middle index
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              CeedScalar v_k = 0;
+
+              for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+              out[a * post + c] = v_k;
+            }
+          }
+          post *= 1;
+        }
+      }
+    }
+  }
+}
+
+extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = 1;
+      CeedInt           post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
+
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        pre  = 1;
+        post = 1;
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          // Update buffers used
+          pre /= 1;
+          const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
+
+          // Contract along middle index
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              if (d == BASIS_DIM - 1) {
+                for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+              } else {
+                for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+              }
+            }
+          }
+          post *= Q;
+        }
+      }
+
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
+        }
+        post *= P;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
+  const CeedInt u_dim_stride  = 0;
+  const CeedInt v_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
+        }
+        post *= Q;
+      }
+
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
+
+          pre  = BASIS_NUM_QPTS;
+          post = 1;
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+            // Update buffers used
+            pre /= Q;
+            const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                CeedScalar v_k = 0;
+
+                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+                out[a * post + c] = v_k;
+              }
+            }
+            post *= 1;
+          }
+        }
+      }
+    }
+  }
+}
+
+extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                 const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+  const CeedInt u_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+  const CeedInt v_dim_stride  = 0;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt     pre   = 1;
+      CeedInt     post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
+
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
+
+          pre  = 1;
+          post = 1;
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+            // Update buffers used
+            pre /= 1;
+            const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                if (dim_2 == BASIS_DIM - 1) {
+                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+                } else {
+                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+                }
+              }
+            }
+            post *= Q;
+          }
+        }
+      }
+
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
+        }
+        post *= P;
+      }
+    }
+  }
+}
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
index 7361c994e0..baa8554eda 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA tensor product basis
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Tensor Basis Kernels
@@ -17,7 +16,7 @@
 //------------------------------------------------------------------------------
 // Interp
 //------------------------------------------------------------------------------
-extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d,
+extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d,
                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
@@ -29,44 +28,42 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos
     s_interp_1d[k] = interp_1d[k];
   }
 
-  const CeedInt P             = transpose ? BASIS_Q_1D : BASIS_P_1D;
-  const CeedInt Q             = transpose ? BASIS_P_1D : BASIS_Q_1D;
-  const CeedInt stride_0      = transpose ? 1 : BASIS_P_1D;
-  const CeedInt stride_1      = transpose ? BASIS_P_1D : 1;
-  const CeedInt u_stride      = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
-  const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
-  const CeedInt u_size        = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt P             = is_transpose ? BASIS_Q_1D : BASIS_P_1D;
+  const CeedInt Q             = is_transpose ? BASIS_P_1D : BASIS_Q_1D;
+  const CeedInt stride_0      = is_transpose ? 1 : BASIS_P_1D;
+  const CeedInt stride_1      = is_transpose ? BASIS_P_1D : 1;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
+  const CeedInt u_size        = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-      const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
-      CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
       CeedInt           pre   = u_size;
       CeedInt           post  = 1;
 
-      for (CeedInt k = i; k < u_size; k += blockDim.x) {
-        s_buffer_1[k] = cur_u[k];
-      }
       for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
         // Update buffers used
         pre /= P;
-        const CeedScalar *in       = d % 2 ? s_buffer_2 : s_buffer_1;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
         CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
         const CeedInt     writeLen = pre * post * Q;
 
         // Contract along middle index
         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-          const CeedInt c  = k % post;
-          const CeedInt j  = (k / post) % Q;
-          const CeedInt a  = k / (post * Q);
-          CeedScalar    vk = 0;
-
-          for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
-          out[k] = vk;
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
+          if (is_transpose && d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
         }
         post *= Q;
       }
@@ -77,7 +74,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos
 //------------------------------------------------------------------------------
 // Grad
 //------------------------------------------------------------------------------
-extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d,
+extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d,
                                 const CeedScalar *__restrict__ grad_1d, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
@@ -91,26 +88,26 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose,
     s_grad_1d[k]   = grad_1d[k];
   }
 
-  const CeedInt P             = transpose ? BASIS_Q_1D : BASIS_P_1D;
-  const CeedInt Q             = transpose ? BASIS_P_1D : BASIS_Q_1D;
-  const CeedInt stride_0      = transpose ? 1 : BASIS_P_1D;
-  const CeedInt stride_1      = transpose ? BASIS_P_1D : 1;
-  const CeedInt u_stride      = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
-  const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
-  const CeedInt u_dim_stride  = transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0;
-  const CeedInt v_dim_stride  = transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP;
+  const CeedInt P             = is_transpose ? BASIS_Q_1D : BASIS_P_1D;
+  const CeedInt Q             = is_transpose ? BASIS_P_1D : BASIS_Q_1D;
+  const CeedInt stride_0      = is_transpose ? 1 : BASIS_P_1D;
+  const CeedInt stride_1      = is_transpose ? BASIS_P_1D : 1;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
+  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0;
+  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP;
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
       // dim*dim contractions for grad
       for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-        CeedInt           pre   = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+        CeedInt           pre   = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
         CeedInt           post  = 1;
-        const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
-        CeedScalar       *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
+        CeedScalar       *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
 
         for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
           __syncthreads();
@@ -129,7 +126,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose,
             CeedScalar    v_k = 0;
 
             for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
-            if (transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k;
+            if (is_transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k;
             else out[k] = v_k;
           }
           post *= Q;
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
index df5b9ad338..5be93d9a1e 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,7 @@
 
 /// @file
 /// Internal header for CUDA operator diagonal assembly
-#include <ceed.h>
+#include <ceed/types.h>
 
 #if USE_CEEDSIZE
 typedef CeedSize IndexType;
diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
index 6333f771f2..76643040fb 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA operator full assembly
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 #if USE_CEEDSIZE
 typedef CeedSize IndexType;
@@ -24,7 +23,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
                         const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out,
                         const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) {
   extern __shared__ CeedScalar s_CT[];
-  CeedScalar                  *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN;
+  CeedScalar                  *s_C = &s_CT[NUM_NODES_OUT * NUM_NODES_IN];
 
   const int l = threadIdx.x;  // The output column index of each B^T D B operation
                               // such that we have (Bout^T)_ij D_jk Bin_kl = C_il
@@ -62,7 +61,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
                 result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l];
               }
             }  // end of out eval mode
-          }    // end of in eval mode
+          }  // end of in eval mode
           if (orients_in) {
             result *= orients_in[NUM_NODES_IN * e + l] ? -1.0 : 1.0;
           }
@@ -101,6 +100,6 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
           }
         }
       }  // end of out component
-    }    // end of in component
-  }      // end of element loop
+    }  // end of in component
+  }  // end of element loop
 }
diff --git a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
index 7fbf7901bc..61785cc00c 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA backend QFunction read/write kernels
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Read from quadrature points
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
new file mode 100644
index 0000000000..73ecc3bb25
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-at-points.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA offset element restriction kernels
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, standard (with offsets)
+//------------------------------------------------------------------------------
+#if !USE_DETERMINISTIC
+extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices, const CeedInt *__restrict__ points_per_elem,
+                                             const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) {
+    const CeedInt ind      = indices[node];
+    const CeedInt loc_node = node % RSTR_ELEM_SIZE;
+    const CeedInt elem     = node / RSTR_ELEM_SIZE;
+
+    if (loc_node >= points_per_elem[elem]) continue;
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+    }
+  }
+}
+#else
+extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices,
+                                             const CeedInt *__restrict__ points_per_elem, const CeedInt *__restrict__ t_offsets,
+                                             const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  CeedScalar value[RSTR_NUM_COMP];
+
+  for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) {
+    const CeedInt ind     = l_vec_indices[i];
+    const CeedInt range_1 = t_offsets[i];
+    const CeedInt range_N = t_offsets[i + 1];
+
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0;
+
+    for (CeedInt j = range_1; j < range_N; j++) {
+      const CeedInt t_ind    = t_indices[j];
+      const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE;
+      const CeedInt elem     = t_ind / RSTR_ELEM_SIZE;
+
+      if (loc_node >= points_per_elem[elem]) continue;
+      for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
+        value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE];
+      }
+    }
+
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp];
+  }
+}
+#endif
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
index d317f42cc5..e83eebb8cd 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA curl-oriented element restriction kernels
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, curl-oriented
@@ -80,7 +79,7 @@ extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ ind
       value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
       value +=
           loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value);
     }
   }
 }
@@ -138,7 +137,7 @@ extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restri
       value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
       value +=
           loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value);
     }
   }
 }
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
index 0bd3dc0dd8..487c4d2194 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA offset element restriction kernels
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, standard (with offsets)
@@ -36,7 +35,7 @@ extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices,
     const CeedInt elem     = node / RSTR_ELEM_SIZE;
 
     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
     }
   }
 }
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
index d36f27277e..ead457562a 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA oriented element restriction kernels
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, oriented
@@ -40,7 +39,7 @@ extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices
     const CeedInt elem     = node / RSTR_ELEM_SIZE;
 
     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE,
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE],
                 u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0));
     }
   }
diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
index d10f73c11d..c5dc12b227 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA strided element restriction kernels
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
new file mode 100644
index 0000000000..d49bc52a4b
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA shared memory non-tensor basis templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// 1D tensor contraction
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void Contract1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (data.t_id_x < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 1D transpose tensor contraction
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x] = *U;
+  __syncthreads();
+  if (data.t_id_x < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// Interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P, int Q, int T_1D>
+inline __device__ void InterpNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                       CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P, int Q, int T_1D>
+inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = 0.0;
+    ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int DIM, int P, int Q, int T_1D>
+inline __device__ void GradNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  for (CeedInt dim = 0; dim < DIM; dim++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int DIM, int P, int Q, int T_1D>
+inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
+  for (CeedInt dim = 0; dim < DIM; dim++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp + dim * NUM_COMP], &c_G[dim * P * Q], &r_V[comp]);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Quadrature weights
+//------------------------------------------------------------------------------
+template <int P, int Q>
+inline __device__ void WeightNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
+  *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
new file mode 100644
index 0000000000..abddaa58cd
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h
@@ -0,0 +1,200 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA shared memory non-tensor basis
+#include <ceed/types.h>
+
+#include "cuda-shared-basis-nontensor-templates.h"
+#include "cuda-shared-basis-read-write-templates.h"
+
+//------------------------------------------------------------------------------
+// Interp kernels
+//------------------------------------------------------------------------------
+extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
+    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
+  }
+}
+
+extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                           CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                              CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
+    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad kernels
+//------------------------------------------------------------------------------
+extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
+
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
+    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
+  }
+}
+
+extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                         CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                            CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
+    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Weight kernel
+//------------------------------------------------------------------------------
+extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_W[1];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    WeightNonTensor<BASIS_P, BASIS_Q>(data, q_weight, r_W);
+    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
+  }
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
index b10ba108f8..ececd93ae6 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,15 @@
 
 /// @file
 /// Internal header for CUDA shared memory basis read/write templates
+#include <ceed/types.h>
 
-#include <ceed.h>
+//------------------------------------------------------------------------------
+// Load matrices for basis actions
+//------------------------------------------------------------------------------
+template <int P, int Q>
+inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+  for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
+}
 
 //------------------------------------------------------------------------------
 // 1D
@@ -46,6 +53,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedIn
   }
 }
 
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D) {
+    const CeedInt node = data.t_id_x;
+    const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 2D
 //------------------------------------------------------------------------------
@@ -82,6 +102,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedIn
   }
 }
 
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 3D
 //------------------------------------------------------------------------------
@@ -121,3 +154,58 @@ inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedIn
     }
   }
 }
+
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+        d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D];
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// E-vector -> single point
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_PTS>
+inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                 const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem,
+                                 const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+  const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
+
+  if (p < points_in_elem) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_u[comp] = d_u[ind + comp * strides_comp];
+    }
+  } else {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_u[comp] = 0.0;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Single point -> E-vector
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_PTS>
+inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                  const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v,
+                                  CeedScalar *d_v) {
+  if (p < points_in_elem) {
+    const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] = r_v[comp];
+    }
+  }
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
new file mode 100644
index 0000000000..6f2843acce
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -0,0 +1,467 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA shared memory tensor product basis AtPoints templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// Chebyshev values
+//------------------------------------------------------------------------------
+template <int Q_1D>
+inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
+  chebyshev_x[0] = 1.0;
+  chebyshev_x[1] = 2 * x;
+  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
+}
+
+template <int Q_1D>
+inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
+  CeedScalar chebyshev_x[3];
+
+  chebyshev_x[1]  = 1.0;
+  chebyshev_x[2]  = 2 * x;
+  chebyshev_dx[0] = 0.0;
+  chebyshev_dx[1] = 2.0;
+  for (CeedInt i = 2; i < Q_1D; i++) {
+    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
+    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 1D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Load coefficients
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                                 CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Load coefficients
+    __syncthreads();
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                               CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 2D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = 0.0;
+      for (CeedInt j = 0; j < Q_1D; j++) {
+        buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+      }
+    }
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * buffer[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                                 CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
+
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = chebyshev_x[i] * r_u;
+    }
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      // Note: shifting to avoid atomic adds
+      const CeedInt ii = (i + data.t_id_y) % Q_1D;
+
+      for (CeedInt j = 0; j < Q_1D; j++) {
+        const CeedInt jj = (j + data.t_id_x) % Q_1D;
+
+        if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                               CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
+
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_u;
+      }
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + data.t_id_y) % Q_1D;
+
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + data.t_id_x) % Q_1D;
+
+          if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+        }
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 3D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction value
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Load coefficients
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp] += chebyshev_x[i] * buffer[i] * z;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                                 CeedScalar *__restrict__ r_C) {
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction value
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
+
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_u * z;
+      }
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + data.t_id_y) % Q_1D;
+
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + data.t_id_x) % Q_1D;
+
+          if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Load coefficients
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      // Gradient directions
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = 0.0;
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+          }
+        }
+        // Contract y and z direction
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz = dim == 2 ? dz : z;
+
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
+        }
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
+                                               CeedScalar *__restrict__ r_C) {
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      // Gradient directions
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract y and z direction
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz  = dim == 2 ? dz : z;
+        const CeedScalar r_u = (p < NUM_POINTS) ? r_U[comp + dim * NUM_COMP] : 0.0;
+
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = chebyshev_x[i] * r_u * zz;
+        }
+
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + data.t_id_y) % Q_1D;
+
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = (j + data.t_id_x) % Q_1D;
+
+            if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          }
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
new file mode 100644
index 0000000000..fc812792e4
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
@@ -0,0 +1,394 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA tensor product basis with AtPoints evaluation
+#include <ceed/types.h>
+
+#include "cuda-shared-basis-read-write-templates.h"
+#include "cuda-shared-basis-tensor-at-points-templates.h"
+#include "cuda-shared-basis-tensor-templates.h"
+
+//------------------------------------------------------------------------------
+// Tensor Basis Kernels AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interp
+//------------------------------------------------------------------------------
+extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
+                                          const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    }
+
+    // Map to points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      if (BASIS_DIM == 1) {
+        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
+                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
+                                                   const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Clear output vector
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
+    if (BASIS_DIM == 1) {
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
+      if (BASIS_DIM == 1) {
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
+                                                      const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
+                                                      const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
+      if (BASIS_DIM == 1) {
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
+                                        const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
+
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    }
+
+    // Map to points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      if (BASIS_DIM == 1) {
+        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
+                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
+                                                 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Clear output vector
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
+    if (BASIS_DIM == 1) {
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
+                                                           r_U);
+      if (BASIS_DIM == 1) {
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
+                                                    const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
+                                                    const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
+                                                           r_U);
+      if (BASIS_DIM == 1) {
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
new file mode 100644
index 0000000000..54594b3af4
--- /dev/null
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h
@@ -0,0 +1,680 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA shared memory tensor product basis templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// 2D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 2D tensor contraction x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < P_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract and add x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                        const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < P_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
+  }
+}
+
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
+    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                       CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D quadrature weights
+//------------------------------------------------------------------------------
+template <int P_1D, int Q_1D>
+inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
+
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
+}
+
+//------------------------------------------------------------------------------
+// 3D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 3D tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract add x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
+  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
+  }
+}
+
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
+  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
+    if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                       CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
+    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D quadrature weights
+//------------------------------------------------------------------------------
+template <int P_1D, int Q_1D>
+inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
+}
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
index f2fde94139..dc05f100ae 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA shared memory tensor product basis templates
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // 1D
@@ -19,6 +18,7 @@
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   *V = 0.0;
@@ -27,7 +27,6 @@ inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, c
       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -35,6 +34,7 @@ inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, c
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   *V = 0.0;
@@ -43,56 +43,77 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca
       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 1D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
+    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 1D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                          CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
+    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
   }
 }
 
 //------------------------------------------------------------------------------
 // 1D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                               CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
+    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 1D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
+    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 1D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
+template <int P_1D, int Q_1D>
 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
 }
@@ -104,8 +125,9 @@ inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restr
 //------------------------------------------------------------------------------
 // 2D tensor contraction x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -114,14 +136,14 @@ inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, c
       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 2D tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -130,14 +152,14 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, c
       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -146,14 +168,14 @@ inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedSca
       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -162,14 +184,14 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedSca
       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract and add x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
@@ -177,69 +199,114 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const Ceed
       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 2D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                       CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                     CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                     CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0 * NUM_COMP);
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1 * NUM_COMP);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0 * NUM_COMP, c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp);
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1 * NUM_COMP, c_G, r_t);
-    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
+template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
 }
@@ -251,7 +318,7 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_
 //------------------------------------------------------------------------------
 // 3D tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[P_1D];
   for (CeedInt i = 0; i < P_1D; i++) {
@@ -259,6 +326,7 @@ inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, c
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -267,14 +335,13 @@ inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, c
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[P_1D];
   for (CeedInt i = 0; i < P_1D; i++) {
@@ -282,6 +349,7 @@ inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, c
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -290,14 +358,13 @@ inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, c
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D tensor contract z
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   for (CeedInt k = 0; k < Q_1D; k++) {
     V[k] = 0.0;
@@ -312,7 +379,7 @@ inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, c
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract z
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   for (CeedInt k = 0; k < P_1D; k++) {
     V[k] = 0.0;
@@ -327,7 +394,7 @@ inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedSca
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -335,6 +402,7 @@ inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedSca
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -343,14 +411,13 @@ inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedSca
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -358,6 +425,7 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const Ceed
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
@@ -365,14 +433,13 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const Ceed
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -380,6 +447,7 @@ inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedSca
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -388,14 +456,13 @@ inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedSca
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract add x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -403,6 +470,7 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
@@ -410,122 +478,173 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                       CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D, c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                     CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < Q_1D; i++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_V[i + comp * Q_1D] = r_U[i + comp * P_1D];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < Q_1D; i++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_V[i + comp * P_1D] = r_U[i + comp * Q_1D];
+    }
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                     CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_G, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * P_1D);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                               CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
-    ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
-    ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
-    ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
+    ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
+    ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
+template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
index d6039d3a33..ae1cdfc5c7 100644
--- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
+++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,14 +7,13 @@
 
 /// @file
 /// Internal header for CUDA shared memory tensor product basis
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "cuda-shared-basis-read-write-templates.h"
 #include "cuda-shared-basis-tensor-templates.h"
 
 //------------------------------------------------------------------------------
-// Interp kernel by dim
+// Interp kernels by dim
 //------------------------------------------------------------------------------
 extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
@@ -24,30 +23,66 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B,
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
   }
 }
 
+extern "C" __global__ void InterpCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                            CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    }
+  }
+}
+
 extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
@@ -57,30 +92,135 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
   }
 }
 
+extern "C" __global__ void InterpCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                                     CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+}
+
+extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                              CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __global__ void InterpCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                                        CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Grad kernel by dim
 //------------------------------------------------------------------------------
@@ -93,26 +233,74 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, c
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
                                                                     d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
-      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __global__ void GradCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                          CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
+                                                                    d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
@@ -128,32 +316,163 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __global__ void GradCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G,
+                                                   const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
-      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, c_G, r_V);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
   }
 }
 
+extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                            CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __global__ void GradCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G,
+                                                      const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Cuda data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // load grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Weight kernels by dim
 //------------------------------------------------------------------------------
@@ -165,19 +484,20 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
 
+  // Apply basis element by element
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
-      Weight1d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
     } else if (BASIS_DIM == 2) {
-      WeightTensor2d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      WeightTensor2d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
     } else if (BASIS_DIM == 3) {
-      WeightTensor3d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      WeightTensor3d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
                                            d_W);
     }
diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h
index 9863caa7e0..58b2961246 100644
--- a/include/ceed/jit-source/cuda/cuda-types.h
+++ b/include/ceed/jit-source/cuda/cuda-types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for CUDA type definitions
-#ifndef CEED_CUDA_TYPES_H
-#define CEED_CUDA_TYPES_H
+#pragma once
 
 #include <ceed/types.h>
 
@@ -24,6 +23,13 @@ typedef struct {
   CeedInt *outputs[CEED_CUDA_NUMBER_FIELDS];
 } FieldsInt_Cuda;
 
+typedef struct {
+  CeedInt           num_elem;
+  const CeedInt    *num_per_elem;
+  const CeedInt    *indices;
+  const CeedScalar *coords;
+} Points_Cuda;
+
 typedef struct {
   CeedInt     t_id_x;
   CeedInt     t_id_y;
@@ -31,5 +37,3 @@ typedef struct {
   CeedInt     t_id;
   CeedScalar *slice;
 } SharedData_Cuda;
-
-#endif  // CEED_CUDA_TYPES_H
diff --git a/include/ceed/jit-source/gallery/ceed-identity-to-scalar.h b/include/ceed/jit-source/gallery/ceed-identity-to-scalar.h
new file mode 100644
index 0000000000..5cf406fe51
--- /dev/null
+++ b/include/ceed/jit-source/gallery/ceed-identity-to-scalar.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/**
+  @brief  Identity QFunction that copies first input component directly into output
+**/
+#include <ceed/types.h>
+
+CEED_QFUNCTION(IdentityScalar)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0] is input, size (Q*size)
+  const CeedScalar *input = in[0];
+  // out[0] is output, size (Q)
+  CeedScalar *output = out[0];
+
+  // Quadrature point loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { output[i] = input[i]; }  // End of Quadrature Point Loop
+  return CEED_ERROR_SUCCESS;
+}
diff --git a/include/ceed/jit-source/gallery/ceed-identity.h b/include/ceed/jit-source/gallery/ceed-identity.h
index 1a84718f4a..110083b372 100644
--- a/include/ceed/jit-source/gallery/ceed-identity.h
+++ b/include/ceed/jit-source/gallery/ceed-identity.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief  Identity QFunction that copies inputs directly into outputs
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 typedef struct {
   CeedInt size;
@@ -27,6 +26,5 @@ CEED_QFUNCTION(Identity)(void *ctx, const CeedInt Q, const CeedScalar *const *in
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q * size; i++) { output[i] = input[i]; }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
index c266beff64..d9a985a56d 100644
--- a/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass1dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 1D mass matrix
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Mass1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is Jacobians, size (Q)
@@ -20,6 +19,5 @@ CEED_QFUNCTION(Mass1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[i] * w[i]; }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
index 7e5f6fbd34..4a6946ebce 100644
--- a/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass2dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 2D mass matrix
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Mass2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is Jacobians with shape [2, nc=2, Q]
@@ -22,6 +21,5 @@ CEED_QFUNCTION(Mass2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
     q_data[i] = (J[0][0][i] * J[1][1][i] - J[0][1][i] * J[1][0][i]) * w[i];
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
index 71dc961215..1d7f094dba 100644
--- a/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-mass3dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 3D mass matrix
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Mass3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is Jacobians with shape [2, nc=3, Q]
@@ -24,6 +23,5 @@ CEED_QFUNCTION(Mass3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const
                  J[0][2][i] * (J[1][0][i] * J[2][1][i] - J[1][1][i] * J[2][0][i])) *
                 w[i];
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-massapply.h b/include/ceed/jit-source/gallery/ceed-massapply.h
index 8559ce8a26..41a0695e39 100644
--- a/include/ceed/jit-source/gallery/ceed-massapply.h
+++ b/include/ceed/jit-source/gallery/ceed-massapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for applying the mass matrix
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is u, size (Q)
@@ -20,6 +19,5 @@ CEED_QFUNCTION(MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *i
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = u[i] * q_data[i]; }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
index dc38d4f21a..d23f134eb0 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for applying the 1D Poisson operator
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, size (Q)
@@ -21,6 +20,5 @@ CEED_QFUNCTION(Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *con
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { vg[i] = ug[i] * q_data[i]; }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
index dce08aabb2..b84fa01d31 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson1dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 1D Poisson operator
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
@@ -24,6 +23,5 @@ CEED_QFUNCTION(Poisson1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *con
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = w[i] / J[i]; }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
index dab64be671..62329ad1a4 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for applying the 2D Poisson operator
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, shape [2, nc=1, Q]
@@ -35,6 +34,5 @@ CEED_QFUNCTION(Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *con
     // j = direction of vg
     for (CeedInt j = 0; j < dim; j++) vg[j][i] = (ug[0][i] * dXdxdXdxT[0][j] + ug[1][i] * dXdxdXdxT[1][j]);
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
index 11e15255ad..8546c304cd 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson2dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 2D Poisson operator
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store
@@ -30,10 +29,10 @@ CEED_QFUNCTION(Poisson2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *con
     const CeedScalar J01 = J[1][0][i];
     const CeedScalar J11 = J[1][1][i];
     const CeedScalar qw  = w[i] / (J00 * J11 - J10 * J01);
-    q_data[0][i]         = qw * (J01 * J01 + J11 * J11);
-    q_data[1][i]         = qw * (J00 * J00 + J10 * J10);
-    q_data[2][i]         = -qw * (J00 * J01 + J10 * J11);
-  }  // End of Quadrature Point Loop
 
+    q_data[0][i] = qw * (J01 * J01 + J11 * J11);
+    q_data[1][i] = qw * (J00 * J00 + J10 * J10);
+    q_data[2][i] = -qw * (J00 * J01 + J10 * J11);
+  }  // End of Quadrature Point Loop
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
index 71e76926e7..77295c9fb8 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for applying the geometric data for the 3D Poisson operator
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, shape [3, nc=1, Q]
@@ -37,6 +36,5 @@ CEED_QFUNCTION(Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *con
     // j = direction of vg
     for (CeedInt j = 0; j < dim; j++) vg[j][i] = (ug[0][i] * dXdxdXdxT[0][j] + ug[1][i] * dXdxdXdxT[1][j] + ug[2][i] * dXdxdXdxT[2][j]);
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
index 2d4e0621e4..b42bbb93f9 100644
--- a/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
+++ b/include/ceed/jit-source/gallery/ceed-poisson3dbuild.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for building the geometric data for the 3D Poisson operator
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Poisson3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store the symmetric part of the result.
@@ -47,6 +46,5 @@ CEED_QFUNCTION(Poisson3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *con
     q_data[4][i] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]);
     q_data[5][i] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]);
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-scale-scalar.h b/include/ceed/jit-source/gallery/ceed-scale-scalar.h
new file mode 100644
index 0000000000..f70c62ec9a
--- /dev/null
+++ b/include/ceed/jit-source/gallery/ceed-scale-scalar.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/**
+  @brief  Scaling QFunction that scales inputs
+**/
+#include <ceed/types.h>
+
+CEED_QFUNCTION(ScaleScalar)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // Ctx holds field size
+  const CeedInt size = *(CeedInt *)ctx;
+
+  // in[0] is input, size (Q*size)
+  // in[1] is scaling factor, size (Q*size)
+  const CeedScalar *input = in[0];
+  const CeedScalar *scale = in[1];
+  // out[0] is output, size (Q*size)
+  CeedScalar *output = out[0];
+
+  // Quadrature point loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) {
+    for (CeedInt j = 0; j < size; j++) output[i + j * Q] = input[i + j * Q] * scale[i];
+  }  // End of Quadrature Point Loop
+  return CEED_ERROR_SUCCESS;
+}
diff --git a/include/ceed/jit-source/gallery/ceed-scale.h b/include/ceed/jit-source/gallery/ceed-scale.h
index 1249810987..6c0157f7e2 100644
--- a/include/ceed/jit-source/gallery/ceed-scale.h
+++ b/include/ceed/jit-source/gallery/ceed-scale.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief  Scaling QFunction that scales inputs
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // Ctx holds field size
@@ -24,5 +23,5 @@ CEED_QFUNCTION(Scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, C
 
   // Quadrature point loop
   CeedPragmaSIMD for (CeedInt i = 0; i < Q * size; i++) { output[i] = input[i] * scale[i]; }  // End of Quadrature Point Loop
-  return 0;
+  return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-vectormassapply.h b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
index 70a2f3e25c..adc67918f6 100644
--- a/include/ceed/jit-source/gallery/ceed-vectormassapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectormassapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for applying the mass matrix on a vector system with three components
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is u, size (Q)
@@ -26,6 +25,5 @@ CEED_QFUNCTION(Vector3MassApply)(void *ctx, const CeedInt Q, const CeedScalar *c
       v[c][i] = u[c][i] * q_data[i];
     }
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
index e056729422..8921c348ae 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson1dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for applying the 1D Poisson operator on a vector system with three components
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, shape [1, nc=3, Q]
@@ -26,6 +25,5 @@ CEED_QFUNCTION(Vector3Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScal
       vg[c][i] = ug[c][i] * q_data[i];
     }
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
index 1b56240048..12f7d73468 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson2dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for applying the 2D Poisson operator on a vector system with three components
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, shape [2, nc=3, Q]
@@ -36,6 +35,5 @@ CEED_QFUNCTION(Vector3Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScal
     for (CeedInt j = 0; j < dim; j++)
       for (CeedInt c = 0; c < num_comp; c++) vg[j][c][i] = (ug[0][c][i] * dXdxdXdxT[0][j] + ug[1][c][i] * dXdxdXdxT[1][j]);
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
index 9ca86dba01..634ecb01a5 100644
--- a/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
+++ b/include/ceed/jit-source/gallery/ceed-vectorpoisson3dapply.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,8 +8,7 @@
 /**
   @brief Ceed QFunction for applying the geometric data for the 3D Poisson on a vector system with three components operator
 **/
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(Vector3Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u, shape [3, nc=3, Q]
@@ -39,6 +38,5 @@ CEED_QFUNCTION(Vector3Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScal
       for (CeedInt c = 0; c < num_comp; c++)
         vg[j][c][i] = (ug[0][c][i] * dXdxdXdxT[0][j] + ug[1][c][i] * dXdxdXdxT[1][j] + ug[2][c][i] * dXdxdXdxT[2][j]);
   }  // End of Quadrature Point Loop
-
   return CEED_ERROR_SUCCESS;
 }
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 812e901866..0064ec66e3 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,30 +7,74 @@
 
 /// @file
 /// Internal header for HIP backend macro and type definitions for JiT source
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Load matrices for basis actions
 //------------------------------------------------------------------------------
 template <int P, int Q>
-inline __device__ void loadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }
 
+//------------------------------------------------------------------------------
+// AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// L-vector -> single point
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
+inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+  const CeedInt ind = indices[p + elem * NUM_PTS];
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_u[comp] = d_u[ind + comp * COMP_STRIDE];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Single point -> L-vector
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
+inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
+  if (p < points_in_elem) {
+    const CeedInt ind = indices[p + elem * NUM_PTS];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_u[ind + comp * COMP_STRIDE] += r_u[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 1D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp = n / P_1D;
+  const CeedInt target_node = n % P_1D;
+
+  if (data.t_id_x == target_node) {
+    r_v[target_comp] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d) {
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
-    const CeedInt ind  = indices[node + elem * P_1d];
+    const CeedInt ind  = indices[node + elem * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -39,9 +83,9 @@ inline __device__ void readDofsOffset1d(SharedData_Hip &data, const CeedInt num_
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d) {
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
@@ -52,24 +96,72 @@ inline __device__ void readDofsStrided1d(SharedData_Hip &data, const CeedInt ele
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d) {
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
-    const CeedInt ind  = indices[node + elem * P_1d];
+    const CeedInt ind  = indices[node + elem * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
   }
 }
 
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp = n / P_1D;
+  const CeedInt target_node = n % P_1D;
+
+  if (data.t_id_x == target_node) {
+    const CeedInt ind = indices[target_node + elem * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt in_comp    = in / P_1D;
+  const CeedInt in_node    = in % P_1D;
+  const CeedInt e_vec_size = P_1D * NUM_COMP;
+
+  if (data.t_id_x < P_1D) {
+    const CeedInt out_node = data.t_id_x;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D) {
+    const CeedInt ind = data.t_id_x + elem * Q_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d) {
+  if (data.t_id_x < P_1D) {
     const CeedInt node = data.t_id_x;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
@@ -81,15 +173,29 @@ inline __device__ void writeDofsStrided1d(SharedData_Hip &data, const CeedInt el
 // 2D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    r_v[target_comp] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
-    const CeedInt ind  = indices[node + elem * P_1d * P_1d];
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -98,10 +204,10 @@ inline __device__ void readDofsOffset2d(SharedData_Hip &data, const CeedInt num_
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
@@ -111,25 +217,80 @@ inline __device__ void readDofsStrided2d(SharedData_Hip &data, const CeedInt ele
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
-    const CeedInt ind  = indices[node + elem * P_1d * P_1d];
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
   }
 }
 
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt elem_size  = P_1D * P_1D;
+  const CeedInt in_comp    = in / elem_size;
+  const CeedInt in_node_x  = in % P_1D;
+  const CeedInt in_node_y  = (in % elem_size) / P_1D;
+  const CeedInt e_vec_size = elem_size * NUM_COMP;
+
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt in_node  = in_node_x + in_node_y * P_1D;
+    const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
+
+      d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;
+
+    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
@@ -140,51 +301,62 @@ inline __device__ void writeDofsStrided2d(SharedData_Hip &data, const CeedInt el
 // 3D
 //------------------------------------------------------------------------------
 
+//------------------------------------------------------------------------------
+// Set E-vector value
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D>
+inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
+  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    r_v[target_node_z + target_comp * P_1D] = value;
+  }
+}
+
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, offsets provided
 //------------------------------------------------------------------------------
-// TODO: remove "Dofs" and "Quads" in the following function names?
-//   - readDofsOffset3d -> readOffset3d ?
-//   - readDofsStrided3d -> readStrided3d ?
-//   - readSliceQuadsOffset3d -> readSliceOffset3d ?
-//   - readSliceQuadsStrided3d -> readSliceStrided3d ?
-//   - writeDofsOffset3d -> writeOffset3d ?
-//   - writeDofsStrided3d -> writeStrided3d ?
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void readDofsOffset3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
-      const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
-
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp];
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
+
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readDofsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP];
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, offests provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
-inline __device__ void readSliceQuadsOffset3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
-                                              const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
-    const CeedInt ind  = indices[node + elem * Q_1d * Q_1d * Q_1d];
+template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
+inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
+                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
+                                               CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
+    const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
   }
@@ -193,11 +365,11 @@ inline __device__ void readSliceQuadsOffset3d(SharedData_Hip &data, const CeedIn
 //------------------------------------------------------------------------------
 // E-vector -> Q-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void readSliceQuadsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
-                                               CeedScalar *__restrict__ r_u) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
-    const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
+template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
+                                              CeedScalar *__restrict__ r_u) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
@@ -207,55 +379,122 @@ inline __device__ void readSliceQuadsStrided3d(SharedData_Hip &data, const CeedI
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, offsets provided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int COMP_STRIDE, int P_1d>
-inline __device__ void writeDofsOffset3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
-                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
-      const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
+                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
+
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
+    }
+  }
+}
+
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
+                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
+                                                  CeedScalar *__restrict__ d_v) {
+  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
+  const CeedInt target_node_x = n % P_1D;
+  const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
+  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
+
+  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
+    const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
+
+    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, full assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int COMP_STRIDE, int P_1D>
+inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
+                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  const CeedInt elem_size  = P_1D * P_1D * P_1D;
+  const CeedInt in_comp    = in / elem_size;
+  const CeedInt in_node_x  = in % P_1D;
+  const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
+  const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
+  const CeedInt e_vec_size = elem_size * NUM_COMP;
+
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+        const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
+
+        d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, Qfunction assembly
+//------------------------------------------------------------------------------
+template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
+inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
+                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
+    for (CeedInt z = 0; z < Q_1D; z++) {
+      const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
+      for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
+        d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D];
+      }
     }
+  }
 }
 
 //------------------------------------------------------------------------------
 // E-vector -> L-vector, strided
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
-inline __device__ void writeDofsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
+template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
+inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                           CeedScalar *__restrict__ d_v) {
-  if (data.t_id_x < P_1d && data.t_id_y < P_1d)
-    for (CeedInt z = 0; z < P_1d; z++) {
-      const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
 
-      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
     }
+  }
 }
 
 //------------------------------------------------------------------------------
 // 3D collocated derivatives computation
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d>
-inline __device__ void gradCollo3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                   CeedScalar *__restrict__ r_V) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                        CeedScalar *__restrict__ r_V) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
+      __syncthreads();
+      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
       __syncthreads();
       // X derivative
       r_V[comp + 0 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
+      }
       // Y derivative
       r_V[comp + 1 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
+      }
       // Z derivative
       r_V[comp + 2 * NUM_COMP] = 0.0;
-      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];  // Contract z direction (Z derivative)
-      __syncthreads();
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
+      }
     }
   }
 }
@@ -263,26 +502,29 @@ inline __device__ void gradCollo3d(SharedData_Hip &data, const CeedInt q, const
 //------------------------------------------------------------------------------
 // 3D collocated derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int Q_1d>
-inline __device__ void gradColloTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-                                            CeedScalar *__restrict__ r_V) {
-  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                                 CeedScalar *__restrict__ r_V) {
+  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // X derivative
-      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
       __syncthreads();
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
+      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
       __syncthreads();
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
+      }
       // Y derivative
-      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
       __syncthreads();
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
+      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
       __syncthreads();
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
+      }
       // Z derivative
-      for (CeedInt i = 0; i < Q_1d; i++)
-        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];  // PARTIAL contract z direction (Z derivative)
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
+      }
     }
   }
 }
diff --git a/include/ceed/jit-source/hip/hip-jit.h b/include/ceed/jit-source/hip/hip-jit.h
index 2ac1968b2d..032d716828 100644
--- a/include/ceed/jit-source/hip/hip-jit.h
+++ b/include/ceed/jit-source/hip/hip-jit.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
index 00b559ff10..c6b951b87a 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP non-tensor product basis templates
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Tensor contraction
@@ -24,7 +23,7 @@ inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U
 
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
     // Run with Q threads
-    U = d_U + elem * strides_elem_U + comp * strides_comp_U;
+    U = &d_U[elem * strides_elem_U + comp * strides_comp_U];
     for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] = 0.0;
     for (CeedInt i = 0; i < P; i++) {
       const CeedScalar val = U[i];
@@ -53,9 +52,9 @@ inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strid
     // Run with P threads
     r_V = 0.0;
     for (CeedInt d = 0; d < Q_COMP; d++) {
-      U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U;
+      U = &d_U[elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U];
       for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i];
     }
-    d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V;
+    d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V;
   }
 }
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
index 953f6f48e3..71074a35dc 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP non-tensor product basis
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "hip-ref-basis-nontensor-templates.h"
 
@@ -21,18 +20,32 @@
 //------------------------------------------------------------------------------
 extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                   CeedScalar *__restrict__ d_V) {
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem)
+    Contract<BASIS_NUM_COMP, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q>(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem,
+                                                                    BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     Contract<BASIS_NUM_COMP, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q>(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem,
                                                                     BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
   }
+#endif
 }
 
 extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem)
+    ContractTranspose<BASIS_NUM_COMP, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q>(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem,
+                                                                             BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ContractTranspose<BASIS_NUM_COMP, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q>(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem,
                                                                              BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
   }
+#endif
 }
 
 //------------------------------------------------------------------------------
@@ -40,18 +53,32 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca
 //------------------------------------------------------------------------------
 extern "C" __global__ void Deriv(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                  CeedScalar *__restrict__ d_V) {
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem)
+    Contract<BASIS_NUM_COMP, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q>(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem,
+                                                                   BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     Contract<BASIS_NUM_COMP, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q>(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem,
                                                                    BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
   }
+#endif
 }
 
 extern "C" __global__ void DerivTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
                                           CeedScalar *__restrict__ d_V) {
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem)
+    ContractTranspose<BASIS_NUM_COMP, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q>(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem,
+                                                                            BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     ContractTranspose<BASIS_NUM_COMP, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q>(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem,
                                                                             BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V);
   }
+#endif
 }
 
 //------------------------------------------------------------------------------
@@ -61,7 +88,13 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re
   const CeedInt t_id = threadIdx.x;
   // TODO load q_weight in shared memory if blockDim.z > 1?
 
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem)
+    d_V[elem * BASIS_Q + t_id] = q_weight[t_id];
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     d_V[elem * BASIS_Q + t_id] = q_weight[t_id];
   }
+#endif
 }
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
new file mode 100644
index 0000000000..61ef0d3f0a
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -0,0 +1,408 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for CUDA tensor product basis with AtPoints evaluation
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// Chebyshev values
+//------------------------------------------------------------------------------
+template <int Q_1D>
+inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
+  chebyshev_x[0] = 1.0;
+  chebyshev_x[1] = 2 * x;
+  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
+}
+
+template <int Q_1D>
+inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
+  CeedScalar chebyshev_x[3];
+
+  chebyshev_x[1]  = 1.0;
+  chebyshev_x[2]  = 2 * x;
+  chebyshev_dx[0] = 0.0;
+  chebyshev_dx[1] = 2.0;
+  for (CeedInt i = 2; i < Q_1D; i++) {
+    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
+    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Tensor Basis Kernels AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interp
+//------------------------------------------------------------------------------
+extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
+        }
+        post *= Q;
+      }
+
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        pre  = BASIS_NUM_QPTS;
+        post = 1;
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          // Update buffers used
+          pre /= Q;
+          const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
+
+          // Contract along middle index
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              CeedScalar v_k = 0;
+
+              for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+              out[a * post + c] = v_k;
+            }
+          }
+          post *= 1;
+        }
+      }
+    }
+  }
+}
+
+extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt           pre   = 1;
+      CeedInt           post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
+
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        pre  = 1;
+        post = 1;
+        for (CeedInt d = 0; d < BASIS_DIM; d++) {
+          // Update buffers used
+          pre /= 1;
+          const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
+          CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
+
+          // Build Chebyshev polynomial values
+          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
+
+          // Contract along middle index
+          for (CeedInt a = 0; a < pre; a++) {
+            for (CeedInt c = 0; c < post; c++) {
+              if (d == BASIS_DIM - 1) {
+                for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+              } else {
+                for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+              }
+            }
+          }
+          post *= Q;
+        }
+      }
+
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
+        }
+        post *= P;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_NODES;
+  const CeedInt v_stride      = BASIS_NUM_PTS;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt u_size        = BASIS_NUM_NODES;
+  const CeedInt u_dim_stride  = 0;
+  const CeedInt v_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedInt           pre   = u_size;
+      CeedInt           post  = 1;
+
+      // Map to coefficients
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= P;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * Q;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
+          out[k] = v_k;
+        }
+        post *= Q;
+      }
+
+      // Map to point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
+
+          pre  = BASIS_NUM_QPTS;
+          post = 1;
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+            // Update buffers used
+            pre /= Q;
+            const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                CeedScalar v_k = 0;
+
+                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
+                out[a * post + c] = v_k;
+              }
+            }
+            post *= 1;
+          }
+        }
+      }
+    }
+  }
+}
+
+extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
+                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                                 const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  const CeedInt i = threadIdx.x;
+
+  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
+  CeedScalar           *s_chebyshev_interp_1d = s_mem;
+  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
+  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
+  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
+  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
+  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
+    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
+  }
+
+  const CeedInt P             = BASIS_P_1D;
+  const CeedInt Q             = BASIS_Q_1D;
+  const CeedInt u_stride      = BASIS_NUM_PTS;
+  const CeedInt v_stride      = BASIS_NUM_NODES;
+  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
+  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
+  const CeedInt u_size        = BASIS_NUM_PTS;
+  const CeedInt u_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
+  const CeedInt v_dim_stride  = 0;
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
+    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
+      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
+      CeedInt     pre   = 1;
+      CeedInt     post  = 1;
+
+      // Clear Chebyshev coeffs
+      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
+        s_chebyshev_coeffs[k] = 0.0;
+      }
+
+      // Map from point
+      __syncthreads();
+      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+        if (p >= points_per_elem[elem]) continue;
+        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
+          const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
+
+          pre  = 1;
+          post = 1;
+          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
+            // Update buffers used
+            pre /= 1;
+            const CeedScalar *in  = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
+            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
+
+            // Build Chebyshev polynomial values
+            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
+
+            // Contract along middle index
+            for (CeedInt a = 0; a < pre; a++) {
+              for (CeedInt c = 0; c < post; c++) {
+                if (dim_2 == BASIS_DIM - 1) {
+                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
+                } else {
+                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
+                }
+              }
+            }
+            post *= Q;
+          }
+        }
+      }
+
+      // Map from coefficients
+      pre  = BASIS_NUM_QPTS;
+      post = 1;
+      for (CeedInt d = 0; d < BASIS_DIM; d++) {
+        __syncthreads();
+        // Update buffers used
+        pre /= Q;
+        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
+        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
+        const CeedInt     writeLen = pre * post * P;
+
+        // Contract along middle index
+        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % P;
+          const CeedInt a   = k / (post * P);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
+          if (d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
+        }
+        post *= P;
+      }
+    }
+  }
+}
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
index efbb06548b..1455b5ac21 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP tensor product basis
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Tensor Basis Kernels
@@ -17,7 +16,7 @@
 //------------------------------------------------------------------------------
 // Interp
 //------------------------------------------------------------------------------
-extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d,
+extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d,
                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
@@ -29,44 +28,42 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos
     s_interp_1d[k] = interp_1d[k];
   }
 
-  const CeedInt P             = transpose ? BASIS_Q_1D : BASIS_P_1D;
-  const CeedInt Q             = transpose ? BASIS_P_1D : BASIS_Q_1D;
-  const CeedInt stride_0      = transpose ? 1 : BASIS_P_1D;
-  const CeedInt stride_1      = transpose ? BASIS_P_1D : 1;
-  const CeedInt u_stride      = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
-  const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
-  const CeedInt u_size        = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt P             = is_transpose ? BASIS_Q_1D : BASIS_P_1D;
+  const CeedInt Q             = is_transpose ? BASIS_P_1D : BASIS_Q_1D;
+  const CeedInt stride_0      = is_transpose ? 1 : BASIS_P_1D;
+  const CeedInt stride_1      = is_transpose ? BASIS_P_1D : 1;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
+  const CeedInt u_size        = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
-      const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
-      CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
+      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
+      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
       CeedInt           pre   = u_size;
       CeedInt           post  = 1;
 
-      for (CeedInt k = i; k < u_size; k += blockDim.x) {
-        s_buffer_1[k] = cur_u[k];
-      }
       for (CeedInt d = 0; d < BASIS_DIM; d++) {
         __syncthreads();
         // Update buffers used
         pre /= P;
-        const CeedScalar *in       = d % 2 ? s_buffer_2 : s_buffer_1;
+        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
         CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
         const CeedInt     writeLen = pre * post * Q;
 
         // Contract along middle index
         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
-          const CeedInt c  = k % post;
-          const CeedInt j  = (k / post) % Q;
-          const CeedInt a  = k / (post * Q);
-          CeedScalar    vk = 0;
-
-          for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
-          out[k] = vk;
+          const CeedInt c   = k % post;
+          const CeedInt j   = (k / post) % Q;
+          const CeedInt a   = k / (post * Q);
+          CeedScalar    v_k = 0;
+
+          for (CeedInt b = 0; b < P; b++) v_k += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
+          if (is_transpose && d == BASIS_DIM - 1) out[k] += v_k;
+          else out[k] = v_k;
         }
         post *= Q;
       }
@@ -77,7 +74,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos
 //------------------------------------------------------------------------------
 // Grad
 //------------------------------------------------------------------------------
-extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d,
+extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ interp_1d,
                                 const CeedScalar *__restrict__ grad_1d, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
@@ -91,26 +88,26 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose,
     s_grad_1d[k]   = grad_1d[k];
   }
 
-  const CeedInt P             = transpose ? BASIS_Q_1D : BASIS_P_1D;
-  const CeedInt Q             = transpose ? BASIS_P_1D : BASIS_Q_1D;
-  const CeedInt stride_0      = transpose ? 1 : BASIS_P_1D;
-  const CeedInt stride_1      = transpose ? BASIS_P_1D : 1;
-  const CeedInt u_stride      = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
-  const CeedInt v_stride      = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
-  const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
-  const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
-  const CeedInt u_dim_stride  = transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0;
-  const CeedInt v_dim_stride  = transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP;
+  const CeedInt P             = is_transpose ? BASIS_Q_1D : BASIS_P_1D;
+  const CeedInt Q             = is_transpose ? BASIS_P_1D : BASIS_Q_1D;
+  const CeedInt stride_0      = is_transpose ? 1 : BASIS_P_1D;
+  const CeedInt stride_1      = is_transpose ? BASIS_P_1D : 1;
+  const CeedInt u_stride      = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+  const CeedInt v_stride      = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS;
+  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES);
+  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS);
+  const CeedInt u_dim_stride  = is_transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0;
+  const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP;
 
   // Apply basis element by element
   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
       // dim*dim contractions for grad
       for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
-        CeedInt           pre   = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
+        CeedInt           pre   = is_transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES;
         CeedInt           post  = 1;
-        const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
-        CeedScalar       *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
+        const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
+        CeedScalar       *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
 
         for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
           __syncthreads();
@@ -129,7 +126,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose,
             CeedScalar    v_k = 0;
 
             for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
-            if (transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k;
+            if (is_transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k;
             else out[k] = v_k;
           }
           post *= Q;
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
index e6a8b6e6a1..581545f71a 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP operator diagonal assembly
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 #if USE_CEEDSIZE
 typedef CeedSize IndexType;
diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
index bf86921066..a235c8be7a 100644
--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP operator full assembly
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 #if USE_CEEDSIZE
 typedef CeedSize IndexType;
@@ -24,7 +23,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
                         const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out,
                         const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) {
   extern __shared__ CeedScalar s_CT[];
-  CeedScalar                  *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN;
+  CeedScalar                  *s_C = &s_CT[NUM_NODES_OUT * NUM_NODES_IN];
 
   const int l = threadIdx.x;  // The output column index of each B^T D B operation
                               // such that we have (Bout^T)_ij D_jk Bin_kl = C_il
@@ -62,7 +61,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
                 result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l];
               }
             }  // end of out eval mode
-          }    // end of in eval mode
+          }  // end of in eval mode
           if (orients_in) {
             result *= orients_in[NUM_NODES_IN * e + l] ? -1.0 : 1.0;
           }
@@ -101,6 +100,6 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
           }
         }
       }  // end of out component
-    }    // end of in component
-  }      // end of element loop
+    }  // end of in component
+  }  // end of element loop
 }
diff --git a/include/ceed/jit-source/hip/hip-ref-qfunction.h b/include/ceed/jit-source/hip/hip-ref-qfunction.h
index 1b423072af..bf605feba4 100644
--- a/include/ceed/jit-source/hip/hip-ref-qfunction.h
+++ b/include/ceed/jit-source/hip/hip-ref-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP backend QFunction read/write kernels
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Read from quadrature points
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
new file mode 100644
index 0000000000..3c88d685a3
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-at-points.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP offset element restriction kernels
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// E-vector -> L-vector, standard (with offsets)
+//------------------------------------------------------------------------------
+#if !USE_DETERMINISTIC
+extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ indices, const CeedInt *__restrict__ points_per_elem,
+                                             const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) {
+    const CeedInt ind      = indices[node];
+    const CeedInt loc_node = node % RSTR_ELEM_SIZE;
+    const CeedInt elem     = node / RSTR_ELEM_SIZE;
+
+    if (loc_node >= points_per_elem[elem]) continue;
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+    }
+  }
+}
+#else
+extern "C" __global__ void AtPointsTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices,
+                                             const CeedInt *__restrict__ points_per_elem, const CeedInt *__restrict__ t_offsets,
+                                             const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+  CeedScalar value[RSTR_NUM_COMP];
+
+  for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) {
+    const CeedInt ind     = l_vec_indices[i];
+    const CeedInt range_1 = t_offsets[i];
+    const CeedInt range_N = t_offsets[i + 1];
+
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0;
+
+    for (CeedInt j = range_1; j < range_N; j++) {
+      const CeedInt t_ind    = t_indices[j];
+      const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE;
+      const CeedInt elem     = t_ind / RSTR_ELEM_SIZE;
+
+      if (loc_node >= points_per_elem[elem]) continue;
+      for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
+        value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE];
+      }
+    }
+
+    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp];
+  }
+}
+#endif
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
index 4d3e88ce27..ee5544309d 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP curl-oriented element restriction kernels
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, curl-oriented
@@ -80,7 +79,7 @@ extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ ind
       value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
       value +=
           loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value);
     }
   }
 }
@@ -138,7 +137,7 @@ extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restri
       value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
       value +=
           loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], value);
     }
   }
 }
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
index 26cd41ee92..a3e952b5ca 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP offset element restriction kernels
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, standard (with offsets)
@@ -36,7 +35,7 @@ extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices,
     const CeedInt elem     = node / RSTR_ELEM_SIZE;
 
     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE], u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]);
     }
   }
 }
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
index cb987fa8a7..ffe8890ef2 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP oriented element restriction kernels
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, oriented
@@ -40,7 +39,7 @@ extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices
     const CeedInt elem     = node / RSTR_ELEM_SIZE;
 
     for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE,
+      atomicAdd(&v[ind + comp * RSTR_COMP_STRIDE],
                 u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0));
     }
   }
diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
index de1335c117..445aede42d 100644
--- a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
+++ b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP strided element restriction kernels
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // L-vector -> E-vector, strided
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
new file mode 100644
index 0000000000..71d183bcf8
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP shared memory non-tensor basis templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// 1D tensor contraction
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void Contract1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (data.t_id_x < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// 1D transpose tensor contraction
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D>
+inline __device__ void ContractTranspose1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  data.slice[data.t_id_x] = *U;
+  __syncthreads();
+  if (data.t_id_x < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// Interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P, int Q, int T_1D>
+inline __device__ void InterpNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                       CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P, int Q, int T_1D>
+inline __device__ void InterpTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = 0.0;
+    ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int DIM, int P, int Q, int T_1D>
+inline __device__ void GradNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  for (CeedInt dim = 0; dim < DIM; dim++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int DIM, int P, int Q, int T_1D>
+inline __device__ void GradTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+                                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
+  for (CeedInt dim = 0; dim < DIM; dim++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp + dim * NUM_COMP], &c_G[dim * P * Q], &r_V[comp]);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Quadrature weights
+//------------------------------------------------------------------------------
+template <int P, int Q>
+inline __device__ void WeightNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
+  *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
new file mode 100644
index 0000000000..175e720a55
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-nontensor.h
@@ -0,0 +1,203 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP shared memory non-tensor basis
+#include <ceed/types.h>
+
+#include "hip-shared-basis-nontensor-templates.h"
+#include "hip-shared-basis-read-write-templates.h"
+
+//------------------------------------------------------------------------------
+// Interp kernels
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
+    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load interp into shared memory
+  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
+  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
+    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad kernels
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
+
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
+    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
+    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load grad into shared memory
+  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
+    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
+    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Weight kernel
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D;
+
+  CeedScalar r_W[1];
+
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    WeightNonTensor<BASIS_P, BASIS_Q>(data, q_weight, r_W);
+    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
+  }
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
index a6d945ac56..80be446bee 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,17 +7,14 @@
 
 /// @file
 /// Internal header for HIP shared memory basis read/write templates
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Helper function: load matrices for basis actions
 //------------------------------------------------------------------------------
-template <int SIZE>
-inline __device__ void loadMatrix(const CeedScalar *d_B, CeedScalar *B) {
-  CeedInt tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-
-  for (CeedInt i = tid; i < SIZE; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
+template <int P, int Q>
+inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
+  for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
 }
 
 //------------------------------------------------------------------------------
@@ -56,6 +53,19 @@ inline __device__ void WriteElementStrided1d(SharedData_Hip &data, const CeedInt
   }
 }
 
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D) {
+    const CeedInt node = data.t_id_x;
+    const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 2D
 //------------------------------------------------------------------------------
@@ -92,6 +102,19 @@ inline __device__ void WriteElementStrided2d(SharedData_Hip &data, const CeedInt
   }
 }
 
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
+    const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] += r_v[comp];
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // 3D
 //------------------------------------------------------------------------------
@@ -131,3 +154,57 @@ inline __device__ void WriteElementStrided3d(SharedData_Hip &data, const CeedInt
     }
   }
 }
+
+template <int NUM_COMP, int P_1D>
+inline __device__ void SumElementStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
+                                           const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
+  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
+    for (CeedInt z = 0; z < P_1D; z++) {
+      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
+      const CeedInt ind  = node * strides_node + elem * strides_elem;
+
+      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+        d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D];
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// E-vector -> single point
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_PTS>
+inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, const CeedInt strides_point,
+                                 const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
+  const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
+
+  if (p < points_in_elem) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_u[comp] = d_u[ind + comp * strides_comp];
+    }
+  } else {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_u[comp] = 0.0;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Single point -> E-vector
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_PTS>
+inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
+                                  const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v,
+                                  CeedScalar *d_v) {
+  if (p < points_in_elem) {
+    const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      d_v[ind + comp * strides_comp] = r_v[comp];
+    }
+  }
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
new file mode 100644
index 0000000000..d93ce6c90b
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -0,0 +1,467 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP shared memory tensor product basis AtPoints templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// Chebyshev values
+//------------------------------------------------------------------------------
+template <int Q_1D>
+inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
+  chebyshev_x[0] = 1.0;
+  chebyshev_x[1] = 2 * x;
+  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
+}
+
+template <int Q_1D>
+inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
+  CeedScalar chebyshev_x[3];
+
+  chebyshev_x[1]  = 1.0;
+  chebyshev_x[2]  = 2 * x;
+  chebyshev_dx[0] = 0.0;
+  chebyshev_dx[1] = 2.0;
+  for (CeedInt i = 2; i < Q_1D; i++) {
+    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
+    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 1D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Load coefficients
+    __syncthreads();
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
+    // Load coefficients
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 2D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = 0.0;
+      for (CeedInt j = 0; j < Q_1D; j++) {
+        buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+      }
+    }
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * buffer[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
+
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = chebyshev_x[i] * r_u;
+    }
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      // Note: shifting to avoid atomic adds
+      const CeedInt ii = (i + data.t_id_y) % Q_1D;
+
+      for (CeedInt j = 0; j < Q_1D; j++) {
+        const CeedInt jj = (j + data.t_id_x) % Q_1D;
+
+        if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
+
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_u;
+      }
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + data.t_id_y) % Q_1D;
+
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + data.t_id_x) % Q_1D;
+
+          if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+        }
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 3D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction value
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Load coefficients
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp] += chebyshev_x[i] * buffer[i] * z;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction value
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
+
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_u * z;
+      }
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + data.t_id_y) % Q_1D;
+
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + data.t_id_x) % Q_1D;
+
+          if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Load coefficients
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      // Gradient directions
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = 0.0;
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+          }
+        }
+        // Contract y and z direction
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz = dim == 2 ? dz : z;
+
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
+        }
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
+inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt k = 0; k < Q_1D; k++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
+
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      // Gradient directions
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract y and z direction
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz  = dim == 2 ? dz : z;
+        const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
+
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = chebyshev_x[i] * r_u * zz;
+        }
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + data.t_id_y) % Q_1D;
+
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = (j + data.t_id_x) % Q_1D;
+
+            if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          }
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
new file mode 100644
index 0000000000..f30e6070c4
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -0,0 +1,396 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP tensor product basis with AtPoints evaluation
+#include <ceed/types.h>
+
+#include "hip-shared-basis-read-write-templates.h"
+#include "hip-shared-basis-tensor-at-points-templates.h"
+#include "hip-shared-basis-tensor-templates.h"
+
+//------------------------------------------------------------------------------
+// Tensor Basis Kernels AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interp
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                        const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    }
+
+    // Map to points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      if (BASIS_DIM == 1) {
+        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                                 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Clear output vector
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
+    if (BASIS_DIM == 1) {
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
+      if (BASIS_DIM == 1) {
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                                    const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
+      if (BASIS_DIM == 1) {
+        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                      const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
+    }
+
+    // Map to points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      if (BASIS_DIM == 1) {
+        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 2) {
+        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      } else if (BASIS_DIM == 3) {
+        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
+      }
+      WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                               const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Clear output vector
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
+    if (BASIS_DIM == 1) {
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
+                                                           r_U);
+      if (BASIS_DIM == 1) {
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
+                                  const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_X[BASIS_DIM];
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Clear register
+    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
+
+    // Map from points
+    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
+
+    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
+      const CeedInt p = i % BASIS_NUM_PTS;
+
+      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
+      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
+                                                           r_U);
+      if (BASIS_DIM == 1) {
+        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 2) {
+        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      } else if (BASIS_DIM == 3) {
+        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
+      }
+    }
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
new file mode 100644
index 0000000000..bba3c2f8a1
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h
@@ -0,0 +1,677 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+/// @file
+/// Internal header for HIP shared memory tensor product basis templates
+#include <ceed/types.h>
+
+//------------------------------------------------------------------------------
+// 2D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 2D tensor contraction x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
+                                            CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < P_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D transpose tensor contract and add x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
+                                                        const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < P_1D && t_id_y < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
+  }
+}
+
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
+    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                       CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int  t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+  CeedScalar r_t[1];
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
+
+  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 2D quadrature weights
+//------------------------------------------------------------------------------
+template <int P_1D, int Q_1D>
+inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
+
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
+}
+
+//------------------------------------------------------------------------------
+// 3D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 3D tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                            const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
+    for (CeedInt i = 0; i < P_1D; i++) {
+      *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract z
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract y
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
+                                                     const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  *V = 0.0;
+  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D transpose tensor contract add x
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
+                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
+  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
+  __syncthreads();
+  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
+  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
+  }
+}
+
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
+  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    __syncthreads();
+    if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                       CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+                                             CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+    ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
+    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
+    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+  CeedScalar    r_t1[1], r_t2[1];
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
+    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
+    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
+    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
+    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
+    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
+
+  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
+  }
+  __syncthreads();
+  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
+}
+
+//------------------------------------------------------------------------------
+// 3D quadrature weights
+//------------------------------------------------------------------------------
+template <int P_1D, int Q_1D>
+inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
+  const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
+
+  *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
index 5e52d1c829..ada945ed1e 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP shared memory tensor product basis templates
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // 1D
@@ -19,6 +18,7 @@
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   *V = 0.0;
@@ -27,7 +27,6 @@ inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, co
       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
@@ -35,6 +34,7 @@ inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 template <int NUM_COMP, int P_1D, int Q_1D>
 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x] = *U;
   __syncthreads();
   *V = 0.0;
@@ -43,56 +43,77 @@ inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScal
       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 1D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
+    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 1D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                          CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp);
+    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                               CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
   }
 }
 
 //------------------------------------------------------------------------------
 // 1D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                               CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
+    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 1D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp);
+    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 1D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
+template <int P_1D, int Q_1D>
 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
 }
@@ -104,8 +125,9 @@ inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restri
 //------------------------------------------------------------------------------
 // 2D tensor contraction x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -114,14 +136,14 @@ inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, co
       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 2D tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -130,14 +152,14 @@ inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, co
       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -146,14 +168,14 @@ inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScal
       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   *V = 0.0;
@@ -162,14 +184,14 @@ inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScal
       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 2D transpose tensor contract and add x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
+  __syncthreads();
   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
   __syncthreads();
   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
@@ -177,68 +199,113 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedS
       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
     }
   }
-  __syncthreads();
 }
 
 //------------------------------------------------------------------------------
 // 2D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                     CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    r_V[comp] = r_U[comp];
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                     CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0 * NUM_COMP);
-    ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t);
-    ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1 * NUM_COMP);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
   CeedScalar r_t[1];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0 * NUM_COMP, c_B, r_t);
-    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp);
-    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1 * NUM_COMP, c_G, r_t);
-    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
+    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
+    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
+    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 2D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
+template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
 }
@@ -250,7 +317,7 @@ inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__
 //------------------------------------------------------------------------------
 // 3D tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[P_1D];
   for (CeedInt i = 0; i < P_1D; i++) {
@@ -258,6 +325,7 @@ inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, co
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -266,14 +334,13 @@ inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, co
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[P_1D];
   for (CeedInt i = 0; i < P_1D; i++) {
@@ -281,6 +348,7 @@ inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, co
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -289,14 +357,13 @@ inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, co
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D tensor contract z
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   for (CeedInt k = 0; k < Q_1D; k++) {
     V[k] = 0.0;
@@ -311,7 +378,7 @@ inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, co
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract z
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   for (CeedInt k = 0; k < P_1D; k++) {
     V[k] = 0.0;
@@ -326,7 +393,7 @@ inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScal
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -334,6 +401,7 @@ inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScal
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -342,14 +410,13 @@ inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScal
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract y
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -357,6 +424,7 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedS
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
@@ -364,14 +432,13 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedS
         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -379,6 +446,7 @@ inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScal
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     V[k] = 0.0;
@@ -387,14 +455,13 @@ inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScal
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D transpose tensor contract add x
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
   CeedScalar r_B[Q_1D];
   for (CeedInt i = 0; i < Q_1D; i++) {
@@ -402,6 +469,7 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedS
   }
 
   for (CeedInt k = 0; k < P_1D; k++) {
+    __syncthreads();
     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
     __syncthreads();
     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
@@ -409,121 +477,172 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedS
         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
       }
     }
-    __syncthreads();
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D interpolate to quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D, c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate to quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                     CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < Q_1D; i++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_V[i + comp * Q_1D] = r_U[i + comp * P_1D];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                              CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < Q_1D; i++) {
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+      r_V[i + comp * P_1D] = r_U[i + comp * Q_1D];
+    }
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                     CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_G, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * P_1D);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives at quadrature points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                               CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1);
-    ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
-    ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D);
-    ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D);
-    ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D);
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
+    ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int P_1D, int Q_1D>
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
   CeedScalar r_t1[T_1D];
   CeedScalar r_t2[T_1D];
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2);
-    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1);
-    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2);
-    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D);
+    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
+    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
+    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives at quadrature points, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
+    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
+    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose, nodes and quadrature points collocated
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
+inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
+    ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
+    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
   }
 }
 
 //------------------------------------------------------------------------------
 // 3D quadrature weights
 //------------------------------------------------------------------------------
-template <int Q_1D>
+template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
index 0a9a1f3cee..9e1d3b5263 100644
--- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP shared memory tensor product basis
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "hip-shared-basis-read-write-templates.h"
 #include "hip-shared-basis-tensor-templates.h"
@@ -17,168 +16,702 @@
 // Interp kernel by dim
 //------------------------------------------------------------------------------
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void Interp(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
-  __syncthreads();
-
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+    Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                          BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
-    void InterpTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+    void InterpCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
-  // load interp_1d into shared memory
-  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
-  __syncthreads();
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    }
+  }
+#else
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
+    }
+  }
+#endif
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
 
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                          BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
   }
+#endif
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+#else
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+#endif
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // load interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __syncthreads();
+
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+    if (elem < num_elem) SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+#endif
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
+                                      CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (elem < num_elem) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+#else
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
+    }
+  }
+#endif
 }
 
 //------------------------------------------------------------------------------
 // Grad kernel by dim
 //------------------------------------------------------------------------------
-extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
-    void Grad(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
-              CeedScalar *__restrict__ d_V) {
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G,
+                                                                         const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
   // load interp_1d and grad_1d into shared memory
   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
-  loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
   __syncthreads();
 
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+    Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
+                                                                    d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+  }
+#else
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
+                                                                    d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+    }
+  }
+#endif
+}
+
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+    void GradCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                        CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+    Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
+                                                                    d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+    GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
-      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
                                                                     d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
-      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
     }
   }
+#endif
 }
 
 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
-    void GradTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
+    void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                        CeedScalar *__restrict__ d_V) {
   extern __shared__ CeedScalar slice[];
 
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
   // load interp_1d and grad_1d into shared memory
   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
-  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
-  loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
   __syncthreads();
 
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+    GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                          BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+#endif
+}
+
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+    void GradCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                 CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
   SharedData_Hip data;
   data.t_id_x = threadIdx.x;
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
 
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+    GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                          BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
-      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 2) {
       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                    r_U);
-      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     } else if (BASIS_DIM == 3) {
       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
-      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
-      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
     }
   }
+#endif
+}
+
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                          CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+    GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+    if (elem < num_elem) SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+#endif
+}
+
+extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
+    void GradCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
+                                    CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+
+  // load interp_1d and grad_1d into shared memory
+  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
+  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
+  __syncthreads();
+
+  // Apply basis element by element
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    if (elem < num_elem) ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+    GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 2) {
+    if (elem < num_elem) ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+    GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  } else if (BASIS_DIM == 3) {
+    if (elem < num_elem) ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+    GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+    if (elem < num_elem) SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+  }
+#else
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
+      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
+                                                                   r_U);
+      GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
+                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
+      GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+#endif
 }
 
 //------------------------------------------------------------------------------
@@ -193,21 +726,36 @@ extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
   data.t_id_y = threadIdx.y;
   data.t_id_z = threadIdx.z;
   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
 
   CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
 
+#ifdef __HIP_PLATFORM_SPIRV__
+  CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z;
+  if (BASIS_DIM == 1) {
+    Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
+    if (elem < num_elem) WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
+  } else if (BASIS_DIM == 2) {
+    WeightTensor2d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
+    if (elem < num_elem) WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
+  } else if (BASIS_DIM == 3) {
+    WeightTensor3d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
+    if (elem < num_elem) WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
+                                           d_W);
+  }
+#else
   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
     if (BASIS_DIM == 1) {
-      Weight1d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
     } else if (BASIS_DIM == 2) {
-      WeightTensor2d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      WeightTensor2d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
     } else if (BASIS_DIM == 3) {
-      WeightTensor3d<BASIS_Q_1D>(data, q_weight_1d, r_W);
+      WeightTensor3d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
       WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
                                            d_W);
     }
   }
+#endif
 }
diff --git a/include/ceed/jit-source/hip/hip-types.h b/include/ceed/jit-source/hip/hip-types.h
index 0042199c8b..ebe689c094 100644
--- a/include/ceed/jit-source/hip/hip-types.h
+++ b/include/ceed/jit-source/hip/hip-types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for HIP type definitions
-#ifndef CEED_HIP_TYPES_H
-#define CEED_HIP_TYPES_H
+#pragma once
 
 #include <ceed/types.h>
 
@@ -24,6 +23,13 @@ typedef struct {
   CeedInt *outputs[CEED_HIP_NUMBER_FIELDS];
 } FieldsInt_Hip;
 
+typedef struct {
+  CeedInt           num_elem;
+  const CeedInt    *num_per_elem;
+  const CeedInt    *indices;
+  const CeedScalar *coords;
+} Points_Hip;
+
 typedef struct {
   CeedInt     t_id_x;
   CeedInt     t_id_y;
@@ -31,5 +37,3 @@ typedef struct {
   CeedInt     t_id;
   CeedScalar *slice;
 } SharedData_Hip;
-
-#endif  // CEED_HIP_TYPES_H
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-1d.h b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
index dd21682225..ed2aceb69a 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-1d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis gradient in 1D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
@@ -126,3 +125,48 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   // write V
   write_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(sV, dV, cstrdV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_gradta_1d_kernel(const CeedScalar *dTinterp, const CeedScalar *dTgrad, const CeedScalar *dU, const int estrdU, const int cstrdU,
+                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar *sU[BASIS_NUM_COMP];
+  CeedScalar *sV[BASIS_NUM_COMP];
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT = (CeedScalar *)shared_data;
+  CeedScalar *sW = sT + BASIS_Q * BASIS_P;
+  sU[0]          = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P);
+  sV[0]          = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q);
+  for (int comp = 1; comp < BASIS_NUM_COMP; comp++) {
+    sU[comp] = sU[comp - 1] + (1 * BASIS_Q);
+    sV[comp] = sV[comp - 1] + (1 * BASIS_P);
+  }
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dTgrad, sT);
+  }
+
+  // read U
+  read_1d<CeedScalar, BASIS_Q, BASIS_NUM_COMP>(dU, cstrdU, sU, tx);
+
+  __syncthreads();
+  magma_grad_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_Q, BASIS_P>(sT, sU, sV, tx);
+  __syncthreads();
+
+  // sum into V
+  sum_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(sV, dV, cstrdV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-2d.h b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
index 23559716dc..9fda73c657 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis gradient in 2D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
@@ -188,3 +187,54 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   // write V
   write_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
+    void magma_gradta_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU,
+                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // here DIM_U = 1, but might be different for a fused operator
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // here DIM_V = 1, but might be different for a fused operator
+  CeedScalar rTmp                           = 0.0;
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sTinterp = (CeedScalar *)shared_data;
+  CeedScalar *sTgrad   = sTinterp + BASIS_Q * BASIS_P;
+  CeedScalar *sTmp     = sTgrad + BASIS_Q * BASIS_P;
+  sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q);
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dinterp1d, sTinterp);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dgrad1d, sTgrad);
+  }
+  __syncthreads();
+
+  /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
+     there is a sync at the end of this function */
+  read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
+  /* first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */
+  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 0, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
+  /* there is a sync at the end of magma_grad_2d_device */
+
+  /* read U (idim = 1 for dU, i_DIM = 0 for rU) --
+     there is a sync at the end of this function */
+  read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx);
+  /* second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */
+  magma_grad_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 1, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
+  /* there is a sync at the end of magma_grad_2d_device */
+
+  // sum into V
+  sum_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-3d.h b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
index c030f8e9e5..4b835216f2 100644
--- a/include/ceed/jit-source/magma/magma-basis-grad-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-grad-3d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis gradient in 3D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
@@ -225,3 +224,61 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
   // write V
   write_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__
+    void magma_gradta_3d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU,
+                                const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // here DIM_U = 1, but might be different for a fused operator
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // here DIM_V = 1, but might be different for a fused operator
+  CeedScalar rTmp                           = 0.0;
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sTinterp = (CeedScalar *)shared_data;
+  CeedScalar *sTgrad   = sTinterp + BASIS_Q * BASIS_P;
+  CeedScalar *sTmp     = sTgrad + BASIS_Q * BASIS_P;
+  sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_Q, (BASIS_Q * BASIS_Q * BASIS_P) + (BASIS_Q * BASIS_P * BASIS_P)));
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dinterp1d, sTinterp);
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dgrad1d, sTgrad);
+  }
+  __syncthreads();
+
+  /* read U (idim = 0 for dU, i_DIM = 0 for rU) --
+     there is a sync at the end of this function */
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx);
+  /* then first call (i_DIM = 0, i_DIM_U = 0, i_DIM_V = 0) */
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 0, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
+  /* there is a sync at the end of magma_grad_3d_device */
+
+  /* read U (idim = 1 for dU, i_DIM = 0 for rU) --
+     there is a sync at the end of this function */
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx);
+  /* then second call (i_DIM = 1, i_DIM_U = 0, i_DIM_V = 0) */
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 1, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
+  /* there is a sync at the end of magma_grad_3d_device */
+
+  /* read U (idim = 2 for dU, i_DIM = 0 for rU) --
+     there is a sync at the end of this function */
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx);
+  /* then third call (i_DIM = 2, i_DIM_U = 0, i_DIM_V = 0) */
+  magma_grad_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P, 2, 0, 0, true>(sTinterp, sTgrad, rU, rV, tx, rTmp, sTmp);
+  /* there is a sync at the end of magma_grad_3d_device */
+
+  // sum into V
+  sum_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV + (0 * dstrdV), cstrdV, rV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-1d.h b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
index ae8d082653..531b9273e2 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-1d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis interpolation in 1D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
@@ -126,3 +125,48 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   // write V
   write_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(sV, dV, cstrdV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_interpta_1d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
+                                  const int cstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar *sU[BASIS_NUM_COMP];
+  CeedScalar *sV[BASIS_NUM_COMP];
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT = (CeedScalar *)shared_data;
+  CeedScalar *sW = sT + BASIS_Q * BASIS_P;
+  sU[0]          = sW + ty * BASIS_NUM_COMP * (BASIS_Q + BASIS_P);
+  sV[0]          = sU[0] + (BASIS_NUM_COMP * 1 * BASIS_Q);
+  for (int comp = 1; comp < BASIS_NUM_COMP; comp++) {
+    sU[comp] = sU[comp - 1] + (1 * BASIS_Q);
+    sV[comp] = sV[comp - 1] + (1 * BASIS_P);
+  }
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dT, sT);
+  }
+
+  // read U
+  read_1d<CeedScalar, BASIS_Q, BASIS_NUM_COMP>(dU, cstrdU, sU, tx);
+
+  __syncthreads();
+  magma_interp_1d_device<CeedScalar, BASIS_DIM, BASIS_NUM_COMP, BASIS_Q, BASIS_P>(sT, sU, sV, tx);
+  __syncthreads();
+
+  // sum into V
+  sum_1d<CeedScalar, BASIS_P, BASIS_NUM_COMP>(sV, dV, cstrdV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-2d.h b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
index a2a41a25ae..04640fe75b 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis interpolation in 1D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
@@ -144,3 +143,44 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_
   // write V
   write_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q, MAGMA_MAXTHREADS_2D)) __global__
+    void magma_interpta_2d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
+                                  const int cstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rTmp                           = 0.0;
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT   = (CeedScalar *)shared_data;
+  CeedScalar *sTmp = sT + BASIS_Q * BASIS_P;
+  sTmp += ty * (BASIS_Q * BASIS_MAX_P_Q);
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dT, sT);
+  }
+
+  // read U -- there is a sync at the end of this function
+  read_U_2d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
+
+  // no sync needed here -- read_U_2d already syncs at the end
+  magma_interp_2d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, rU, rV, tx, rTmp, sTmp);
+  __syncthreads();
+
+  // sum into V
+  sum_V_2d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-3d.h b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
index 50c7e4df4a..004071ee32 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-3d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis interpolation in 3D
-
 #include "magma-common-tensor.h"
 
 // macros to abstract access of shared memory and reg. file
@@ -172,3 +171,44 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MA
   // write V
   write_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_MAX_P_Q *BASIS_MAX_P_Q, MAGMA_MAXTHREADS_3D)) __global__
+    void magma_interpta_3d_kernel(const CeedScalar *dT, const CeedScalar *dU, const int estrdU, const int cstrdU, CeedScalar *dV, const int estrdV,
+                                  const int cstrdV, const int nelem) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data)
+
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int elem_id = (blockIdx.x * blockDim.y) + ty;
+
+  if (elem_id >= nelem) return;
+
+  CeedScalar rU[1][BASIS_NUM_COMP][BASIS_Q] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rV[1][BASIS_NUM_COMP][BASIS_P] = {0.0};  // for a non-fused operator BASIS_DIM is always 1
+  CeedScalar rTmp[BASIS_P]                  = {0.0};
+
+  // shift global memory pointers by elem stride
+  dU += elem_id * estrdU;
+  dV += elem_id * estrdV;
+
+  // assign shared memory pointers
+  CeedScalar *sT   = (CeedScalar *)shared_data;
+  CeedScalar *sTmp = sT + BASIS_Q * BASIS_P;
+  sTmp += ty * (max(BASIS_Q * BASIS_Q * BASIS_MAX_P_Q, BASIS_Q * BASIS_P * BASIS_P));
+
+  // read T
+  if (ty == 0) {
+    read_T_trans_gm2sm<BASIS_Q, BASIS_P>(tx, dT, sT);
+  }
+
+  // read U (idim = 0 for dU, i_DIM = 0 for rU, u_dimstride is always 0)
+  read_U_3d<CeedScalar, BASIS_Q, 1, BASIS_NUM_COMP, BASIS_Q, 0>(dU, cstrdU, rU, sTmp, tx);
+  // there is a sync at the end of this function
+
+  magma_interp_3d_device<CeedScalar, 1, 1, BASIS_NUM_COMP, BASIS_Q, BASIS_P, BASIS_Q, BASIS_P>(sT, rU, rV, tx, rTmp, sTmp);
+  __syncthreads();
+
+  // sum into V
+  sum_V_3d<CeedScalar, BASIS_P, 1, BASIS_NUM_COMP, BASIS_P, 0>(dV, cstrdV, rV, tx);
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
index f5e2df1e90..15f2b90ce6 100644
--- a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA non-tensor basis interpolation
-
 #include "magma-common-nontensor.h"
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -99,6 +98,52 @@ static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, Ce
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+template <typename T, int Q_COMP, int P, int Q, int NB>
+static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC,
+                                                                  CeedScalar *shared_data) {
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int id      = blockIdx.x * blockDim.y + ty;
+  const int nblocks = (n + NB - 1) / NB;
+  const int myn     = min(NB, n - id * NB);
+
+  dB += id * Q * NB;
+  dC += id * P * NB;
+
+  // A is P x Q
+  CeedScalar *sA = shared_data;
+  CeedScalar *sB = shared_data + ty * Q * NB;
+
+  CeedScalar rC[NB] = {0.0};
+
+  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
+  for (int d = 0; d < Q_COMP; d++) {
+    // read A using all threads
+    CeedScalar rA[Q];
+    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
+    __syncthreads();
+
+    // read B
+    if (id < nblocks) {
+      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
+    }
+    __syncthreads();
+
+    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);
+
+    dA += P * Q;
+    dB += Q * n;
+
+    __syncthreads();
+  }
+
+  // sum into C
+  if (id < nblocks) {
+    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 template <typename T, int P, int Q, int NB>
 static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
@@ -171,6 +216,42 @@ static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, C
   write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
 }
 
+////////////////////////////////////////////////////////////////////////////////
+template <typename T, int P, int Q, int NB>
+static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
+                                                                   CeedScalar *shared_data) {
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int id      = blockIdx.x * blockDim.y + ty;
+  const int nblocks = (n + NB - 1) / NB;
+  const int myn     = min(NB, n - id * NB);
+
+  dB += id * Q * NB;
+  dC += id * P * NB;
+
+  // A is P x Q
+  CeedScalar *sA = shared_data;
+  CeedScalar *sB = shared_data + ty * Q * NB;
+
+  // read A using all threads
+  CeedScalar rA[Q];
+  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
+  __syncthreads();
+
+  // terminate threads with no work
+  if (id >= nblocks) return;
+
+  // read B
+  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
+  __syncthreads();
+
+  CeedScalar rC[NB];
+  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);
+
+  // sum into C
+  sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
     void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
@@ -195,6 +276,18 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) _
 #endif
 }
 
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+#if BASIS_Q_COMP_INTERP == 1
+  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
+#else
+  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
     void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
@@ -218,3 +311,15 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) _
   magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
 #endif
 }
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+#if BASIS_Q_COMP_DERIV == 1
+  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
+#else
+  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
+#endif
+}
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-1d.h b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
index 431fbb6d03..d922a7586c 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis weight in 1D
-
 #include "magma-common-tensor.h"
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-2d.h b/include/ceed/jit-source/magma/magma-basis-weight-2d.h
index 034992e8f1..9cbb18baae 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-2d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-2d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis weight in 2D
-
 #include "magma-common-tensor.h"
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-3d.h b/include/ceed/jit-source/magma/magma-basis-weight-3d.h
index a5ee73bd96..8fc3e96919 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-3d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-3d.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA tensor basis weight in 3D
-
 #include "magma-common-tensor.h"
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
index 6a20ecefd6..51cf97d727 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,6 @@
 
 /// @file
 /// Internal header for MAGMA non-tensor basis weight
-
 #include "magma-common-nontensor.h"
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/include/ceed/jit-source/magma/magma-common-defs.h b/include/ceed/jit-source/magma/magma-common-defs.h
index a4913c2082..22a1b835cb 100644
--- a/include/ceed/jit-source/magma/magma-common-defs.h
+++ b/include/ceed/jit-source/magma/magma-common-defs.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for MAGMA backend common definitions
-#ifndef CEED_MAGMA_COMMON_DEFS_H
-#define CEED_MAGMA_COMMON_DEFS_H
+#pragma once
 
 #define MAGMA_DEVICE_SHARED(type, name) extern __shared__ type name[];
 
@@ -21,5 +20,3 @@
 
 // Define macro for computing the total threads in a block for use with __launch_bounds__()
 #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt))
-
-#endif  // CEED_MAGMA_COMMON_DEFS_H
diff --git a/include/ceed/jit-source/magma/magma-common-nontensor.h b/include/ceed/jit-source/magma/magma-common-nontensor.h
index 730acc6419..8f33484295 100644
--- a/include/ceed/jit-source/magma/magma-common-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-common-nontensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -104,6 +104,25 @@ static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// sum into C from reg. to global
+// C is (P x NB)
+// 1D thread config. with (P x 1) threads
+// no sync at the end of the function
+template <typename T, int P, int Q, int NB>
+static __device__ __inline__ void sum_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC) {
+  if (n != NB) {
+    for (int i = 0; i < n; i++) {
+      dC[i * P + tx] += rC[i];
+    }
+  } else {
+#pragma unroll
+    for (int i = 0; i < NB; i++) {
+      dC[i * P + tx] += rC[i];
+    }
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // multiply C = A x B using 1D threads in P x 1 config
 // A (P x Q)  in reg., one row per thread
diff --git a/include/ceed/jit-source/magma/magma-common-tensor.h b/include/ceed/jit-source/magma/magma-common-tensor.h
index 6c483abd9d..d0ca2f53c0 100644
--- a/include/ceed/jit-source/magma/magma-common-tensor.h
+++ b/include/ceed/jit-source/magma/magma-common-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -36,6 +36,18 @@ static __device__ __inline__ void write_1d(T *sBuffer[NUM_COMP], T *devptr, cons
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// sum into V of a 1D element into global memory from sV[][] --  for all components
+// the devptr is assumed to point directly to the element
+template <typename T, int LENGTH, int NUM_COMP>
+static __device__ __inline__ void sum_1d(T *sBuffer[NUM_COMP], T *devptr, const int compstride, const int tx) {
+  if (tx < LENGTH) {
+    for (int comp = 0; comp < NUM_COMP; comp++) {
+      devptr[comp * compstride + tx] += sBuffer[comp][tx];
+    }
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // read U of a 2D element into registers rU[][][] --  for all components of a single dim
 // dU is assumed to be offset by elem-stride and dim-stride
@@ -107,6 +119,23 @@ static __device__ __inline__ void write_V_2d(T *dV, const int compstride, T rV[D
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// sum into V of a 2D element from registers rV[][][] to global memory --  for all components of a single dim
+// dV is assumed to be offset by elem-stride and dim-stride
+// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
+// i_DIM specifies which dimension is being written to in dV
+// rV_SIZE can be different from P (e.g. max(P, Q))
+template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
+static __device__ __inline__ void sum_V_2d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
+  if (tx < Q) {
+    for (int comp = 0; comp < NUM_COMP; comp++) {
+      for (int j = 0; j < Q; j++) {
+        dV[comp * compstride + j * Q + tx] += rV[i_DIM][comp][j];
+      }
+    }
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // read U of a 3D element into registers rU[][][] --  for all components of a single dim
 // dU is assumed to be offset by elem-stride and dim-stride
@@ -178,6 +207,23 @@ static __device__ __inline__ void write_V_3d(T *dV, const int compstride, T rV[D
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// sum into V of a 3D element from registers rV[][][] to global memory --  for all components of a single dim
+// dV is assumed to point directly to the element (i.e. already offset by elem-stride)
+// register is assumed to be rV[DIM_V][NUM_COMP][rV_SIZE]
+// i_DIM specifies which dimension is being written to in dV
+// rV_SIZE can be different from P (e.g. max(P, Q))
+template <typename T, int Q, int DIM_V, int NUM_COMP, int rV_SIZE, int i_DIM>
+static __device__ __inline__ void sum_V_3d(T *dV, const int compstride, T rV[DIM_V][NUM_COMP][rV_SIZE], const int tx) {
+  if (tx < (Q * Q)) {
+    for (int comp = 0; comp < NUM_COMP; comp++) {
+      for (int j = 0; j < Q; j++) {
+        dV[comp * compstride + j * (Q * Q) + tx] += rV[i_DIM][comp][j];
+      }
+    }
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // reads T (no-trans) into shared memory
 // T is B x J
diff --git a/include/ceed/jit-source/sycl/sycl-gen-templates.h b/include/ceed/jit-source/sycl/sycl-gen-templates.h
index aa54232c2d..5dada5b9eb 100644
--- a/include/ceed/jit-source/sycl/sycl-gen-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-gen-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,7 +7,7 @@
 
 /// @file
 /// Internal header for SYCL backend macro and type definitions for JiT source
-#include <ceed.h>
+#include <ceed/types.h>
 
 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
diff --git a/include/ceed/jit-source/sycl/sycl-jit.h b/include/ceed/jit-source/sycl/sycl-jit.h
index f4824d8a34..25837f5701 100644
--- a/include/ceed/jit-source/sycl/sycl-jit.h
+++ b/include/ceed/jit-source/sycl/sycl-jit.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
index d62de2533a..9f5df69e68 100644
--- a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
+++ b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for SYCL backend QFunction read/write kernels
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Read from quadrature points
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
index 421875b509..551789e48b 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-read-write-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,9 +7,7 @@
 
 /// @file
 /// Internal header for SYCL shared memory basis read/write templates
-
-#include <ceed.h>
-#include "sycl-types.h"
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // Helper function: load matrices for basis actions
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
index 28bd24d9f9..f023b77d6b 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for SYCL shared memory tensor product basis templates
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 //------------------------------------------------------------------------------
 // 1D
diff --git a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
index f8e4ccdc0a..71f60cce8b 100644
--- a/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
+++ b/include/ceed/jit-source/sycl/sycl-shared-basis-tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -7,8 +7,7 @@
 
 /// @file
 /// Internal header for SYCL shared memory tensor product basis
-
-#include <ceed.h>
+#include <ceed/types.h>
 
 #include "sycl-shared-basis-read-write-templates.h"
 #include "sycl-shared-basis-tensor-templates.h"
diff --git a/include/ceed/jit-source/sycl/sycl-types.h b/include/ceed/jit-source/sycl/sycl-types.h
index 58938a4b2a..5133c6eee8 100644
--- a/include/ceed/jit-source/sycl/sycl-types.h
+++ b/include/ceed/jit-source/sycl/sycl-types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/jit-tools.h b/include/ceed/jit-tools.h
index 60e0795f50..c82a9ad075 100644
--- a/include/ceed/jit-tools.h
+++ b/include/ceed/jit-tools.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
diff --git a/include/ceed/types.h b/include/ceed/types.h
index 6817a73322..c687c218f9 100644
--- a/include/ceed/types.h
+++ b/include/ceed/types.h
@@ -1,4 +1,4 @@
-/// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+/// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 /// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 ///
 /// SPDX-License-Identifier: BSD-2-Clause
@@ -10,8 +10,10 @@
 #ifndef CEED_QFUNCTION_DEFS_H
 #define CEED_QFUNCTION_DEFS_H
 
+#ifndef CEED_RUNNING_JIT_PASS
 #include <stddef.h>
 #include <stdint.h>
+#endif
 
 /**
   @ingroup CeedQFunction
@@ -23,7 +25,7 @@
 #ifndef __NO_INLINE__
 #if defined(__GNUC__) || defined(__clang__)
 #define CEED_QFUNCTION_ATTR __attribute__((flatten))
-#elif defined(__INTEL_COMPILER)
+#elif defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
 #define CEED_QFUNCTION_ATTR _Pragma("forceinline")
 #else
 #define CEED_QFUNCTION_ATTR
@@ -49,6 +51,33 @@ backends. It also creates a variable `name_loc` populated with the correct sourc
   CEED_QFUNCTION_ATTR static int name
 #endif
 
+/**
+    @ingroup CeedQFunction
+This macro populates the correct function for Rust-based User QFunction source for code generation backends or populates default values for CPU backends. It also creates a variable `name_loc` populated with the correct source path for creating the respective User QFunction. Note that the function, as named in rust, must be called `name_rs`. When referencing it in C, use just `name` (no `_rs`)
+Example:
+//ex1-volume.h
+CEED_QFUNCTION_RUST(build_mass)
+//ex1-volume.c
+CeedAddRustSourceRoot(ceed, "examples/ceed/ex1-volume-rs");
+// ex1-volume-rs/src/lib.rs
+#[no_mangle]
+pub unsafe extern "C" fn build_mass_rs(
+    ctx: *mut c_void,
+    Q: i32,
+    in: *const *const f64,
+    out: *mut *mut f64,
+) -> i8
+**/
+#ifndef CEED_QFUNCTION_RUST
+#define CEED_QFUNCTION_RUST(name)                                                                                            \
+  CEED_QFUNCTION_ATTR int        name##_rs(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out); \
+  CEED_QFUNCTION_ATTR static int name(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {     \
+    return name##_rs(ctx, Q, in, out);                                                                                       \
+  }                                                                                                                          \
+  static const char name##_loc[] = __FILE__ ":" #name;
+#endif
+// Note: placing the _loc of the function below the function in the macro is required because python cffi will exclude the previous line (the }) based on the backslash at the end of it, which is required for our python build script to exclude macros. See /python/build_ceed_cffi.py for more details
+
 /**
   @ingroup CeedQFunction
   This macro populates the correct function annotations for User QFunction helper function source for code generation backends or populates default
@@ -74,7 +103,7 @@ values for CPU backends.
     Code generation backends may redefine this macro, as needed.
 **/
 #ifndef CeedPragmaSIMD
-#if defined(__INTEL_COMPILER)
+#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
 #define CeedPragmaSIMD _Pragma("vector")
 /// Cannot use Intel pragma ivdep because it miscompiles unpacking symmetric tensors, as in Poisson2DApply, where the SIMD loop body contains
 /// temporaries such as the following.
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 5618a087ab..23bf05c419 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -153,29 +153,59 @@ static int CeedGivensRotation(CeedScalar *A, CeedScalar c, CeedScalar s, CeedTra
   @param[in] m      Number of rows in array
   @param[in] n      Number of columns in array
   @param[in] a      Array to be viewed
+  @param[in] tabs   Tabs to append before each new line
   @param[in] stream Stream to view to, e.g., `stdout`
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref Developer
 **/
-static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedInt n, const CeedScalar *a, FILE *stream) {
+static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedInt n, const CeedScalar *a, const char *tabs, FILE *stream) {
   if (m > 1) {
-    fprintf(stream, "  %s:\n", name);
+    fprintf(stream, "%s  %s:\n", tabs, name);
   } else {
     char padded_name[12];
 
     snprintf(padded_name, 11, "%s:", name);
-    fprintf(stream, "  %-10s", padded_name);
+    fprintf(stream, "%s  %-10s", tabs, padded_name);
   }
   for (CeedInt i = 0; i < m; i++) {
-    if (m > 1) fprintf(stream, "    [%" CeedInt_FMT "]", i);
+    if (m > 1) fprintf(stream, "%s    [%" CeedInt_FMT "]", tabs, i);
     for (CeedInt j = 0; j < n; j++) fprintf(stream, fp_fmt, fabs(a[i * n + j]) > 1E-14 ? a[i * n + j] : 0);
     fputs("\n", stream);
   }
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief View a `CeedBasis` passed as a `CeedObject`
+
+  @param[in] basis  `CeedBasis` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedBasisView_Object(CeedObject basis, FILE *stream) {
+  CeedCall(CeedBasisView((CeedBasis)basis, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Destroy a `CeedBasis` passed as a `CeedObject`
+
+  @param[in,out] basis Address of `CeedBasis` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedBasisDestroy_Object(CeedObject *basis) {
+  CeedCall(CeedBasisDestroy((CeedBasis *)basis));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Create the interpolation and gradient matrices for projection from the nodes of `basis_from` to the nodes of `basis_to`.
 
@@ -194,23 +224,27 @@ static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedI
   @ref Developer
 **/
 static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis basis_to, CeedScalar **interp_project, CeedScalar **grad_project) {
-  Ceed    ceed;
-  bool    is_tensor_to, is_tensor_from;
+  bool    are_both_tensor;
   CeedInt Q, Q_to, Q_from, P_to, P_from;
 
-  CeedCall(CeedBasisGetCeed(basis_to, &ceed));
-
   // Check for compatible quadrature spaces
   CeedCall(CeedBasisGetNumQuadraturePoints(basis_to, &Q_to));
   CeedCall(CeedBasisGetNumQuadraturePoints(basis_from, &Q_from));
-  CeedCheck(Q_to == Q_from, ceed, CEED_ERROR_DIMENSION, "Bases must have compatible quadrature spaces");
+  CeedCheck(Q_to == Q_from, CeedBasisReturnCeed(basis_to), CEED_ERROR_DIMENSION,
+            "Bases must have compatible quadrature spaces."
+            " 'basis_from' has %" CeedInt_FMT " points and 'basis_to' has %" CeedInt_FMT,
+            Q_from, Q_to);
   Q = Q_to;
 
   // Check for matching tensor or non-tensor
-  CeedCall(CeedBasisIsTensor(basis_to, &is_tensor_to));
-  CeedCall(CeedBasisIsTensor(basis_from, &is_tensor_from));
-  CeedCheck(is_tensor_to == is_tensor_from, ceed, CEED_ERROR_MINOR, "Bases must both be tensor or non-tensor");
-  if (is_tensor_to) {
+  {
+    bool is_tensor_to, is_tensor_from;
+
+    CeedCall(CeedBasisIsTensor(basis_to, &is_tensor_to));
+    CeedCall(CeedBasisIsTensor(basis_from, &is_tensor_from));
+    are_both_tensor = is_tensor_to && is_tensor_from;
+  }
+  if (are_both_tensor) {
     CeedCall(CeedBasisGetNumNodes1D(basis_to, &P_to));
     CeedCall(CeedBasisGetNumNodes1D(basis_from, &P_from));
     CeedCall(CeedBasisGetNumQuadraturePoints1D(basis_from, &Q));
@@ -221,17 +255,21 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
 
   // Check for matching FE space
   CeedFESpace fe_space_to, fe_space_from;
+
   CeedCall(CeedBasisGetFESpace(basis_to, &fe_space_to));
   CeedCall(CeedBasisGetFESpace(basis_from, &fe_space_from));
-  CeedCheck(fe_space_to == fe_space_from, ceed, CEED_ERROR_MINOR, "Bases must both be the same FE space type");
+  CeedCheck(fe_space_to == fe_space_from, CeedBasisReturnCeed(basis_to), CEED_ERROR_MINOR,
+            "Bases must both be the same FE space type."
+            " 'basis_from' is a %s and 'basis_to' is a %s",
+            CeedFESpaces[fe_space_from], CeedFESpaces[fe_space_to]);
 
   // Get source matrices
   CeedInt           dim, q_comp = 1;
   CeedScalar       *interp_to_inv, *interp_from;
   const CeedScalar *interp_to_source = NULL, *interp_from_source = NULL, *grad_from_source = NULL;
 
-  CeedCall(CeedBasisGetDimension(basis_to, &dim));
-  if (is_tensor_to) {
+  CeedCall(CeedBasisGetDimension(basis_from, &dim));
+  if (are_both_tensor) {
     CeedCall(CeedBasisGetInterp1D(basis_to, &interp_to_source));
     CeedCall(CeedBasisGetInterp1D(basis_from, &interp_from_source));
   } else {
@@ -246,19 +284,19 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   // projection basis will have a gradient operation (allocated even if not H^1 for the
   // basis construction later on)
   if (fe_space_to == CEED_FE_SPACE_H1) {
-    if (is_tensor_to) {
+    if (are_both_tensor) {
       CeedCall(CeedBasisGetGrad1D(basis_from, &grad_from_source));
     } else {
       CeedCall(CeedBasisGetGrad(basis_from, &grad_from_source));
     }
   }
-  CeedCall(CeedCalloc(P_to * P_from * (is_tensor_to ? 1 : dim), grad_project));
+  CeedCall(CeedCalloc(P_to * P_from * (are_both_tensor ? 1 : dim), grad_project));
 
   // Compute interp_to^+, pseudoinverse of interp_to
   CeedCall(CeedCalloc(Q * q_comp * P_to, &interp_to_inv));
-  CeedCall(CeedMatrixPseudoinverse(ceed, interp_to_source, Q * q_comp, P_to, interp_to_inv));
+  CeedCall(CeedMatrixPseudoinverse(CeedBasisReturnCeed(basis_to), interp_to_source, Q * q_comp, P_to, interp_to_inv));
   // Build matrices
-  CeedInt     num_matrices = 1 + (fe_space_to == CEED_FE_SPACE_H1) * (is_tensor_to ? 1 : dim);
+  CeedInt     num_matrices = 1 + (fe_space_to == CEED_FE_SPACE_H1) * (are_both_tensor ? 1 : dim);
   CeedScalar *input_from[num_matrices], *output_project[num_matrices];
 
   input_from[0]     = (CeedScalar *)interp_from_source;
@@ -270,7 +308,7 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   for (CeedInt m = 0; m < num_matrices; m++) {
     // output_project = interp_to^+ * interp_from
     memcpy(interp_from, input_from[m], Q * P_from * q_comp * sizeof(input_from[m][0]));
-    CeedCall(CeedMatrixMatrixMultiply(ceed, interp_to_inv, input_from[m], output_project[m], P_to, P_from, Q * q_comp));
+    CeedCall(CeedMatrixMatrixMultiply(CeedBasisReturnCeed(basis_to), interp_to_inv, input_from[m], output_project[m], P_to, P_from, Q * q_comp));
     // Round zero to machine precision
     for (CeedInt i = 0; i < P_to * P_from; i++) {
       if (fabs(output_project[m][i]) < 10 * CEED_EPSILON) output_project[m][i] = 0.0;
@@ -283,161 +321,625 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis bas
   return CEED_ERROR_SUCCESS;
 }
 
-/// @}
-
-/// ----------------------------------------------------------------------------
-/// Ceed Backend API
-/// ----------------------------------------------------------------------------
-/// @addtogroup CeedBasisBackend
-/// @{
-
 /**
-  @brief Return collocated gradient matrix
+  @brief Check input vector dimensions for CeedBasisApply[Add]
 
-  @param[in]  basis         `CeedBasis`
-  @param[out] collo_grad_1d Row-major (`Q_1d * Q_1d`) matrix expressing derivatives of basis functions at quadrature points
+  @param[in]  basis     `CeedBasis` to evaluate
+  @param[in]  num_elem  The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  t_mode    @ref CEED_NOTRANSPOSE to evaluate from nodes to quadrature points;
+                          @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes
+  @param[in]  eval_mode @ref CEED_EVAL_NONE to use values directly,
+                          @ref CEED_EVAL_INTERP to use interpolated values,
+                          @ref CEED_EVAL_GRAD to use gradients,
+                          @ref CEED_EVAL_DIV to use divergence,
+                          @ref CEED_EVAL_CURL to use curl,
+                          @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  u         Input `CeedVector`
+  @param[out] v         Output `CeedVector`
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref Backend
+  @ref Developer
 **/
-int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *collo_grad_1d) {
-  Ceed              ceed;
-  CeedInt           P_1d, Q_1d;
-  CeedScalar       *interp_1d_pinv;
-  const CeedScalar *grad_1d, *interp_1d;
-
-  // Note: This function is for backend use, so all errors are terminal and we do not need to clean up memory on failure.
-  CeedCall(CeedBasisGetCeed(basis, &ceed));
-  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
-  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
-
-  // Compute interp_1d^+, pseudoinverse of interp_1d
-  CeedCall(CeedCalloc(P_1d * Q_1d, &interp_1d_pinv));
-  CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
-  CeedCall(CeedMatrixPseudoinverse(ceed, interp_1d, Q_1d, P_1d, interp_1d_pinv));
-  CeedCall(CeedBasisGetGrad1D(basis, &grad_1d));
-  CeedCall(CeedMatrixMatrixMultiply(ceed, grad_1d, (const CeedScalar *)interp_1d_pinv, collo_grad_1d, Q_1d, Q_1d, P_1d));
-
-  CeedCall(CeedFree(&interp_1d_pinv));
-  return CEED_ERROR_SUCCESS;
-}
-
-/**
-  @brief Get tensor status for given `CeedBasis`
-
-  @param[in]  basis     `CeedBasis`
-  @param[out] is_tensor Variable to store tensor status
+static int CeedBasisApplyCheckDims(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+  CeedInt  dim, num_comp, q_comp, num_nodes, num_qpts;
+  CeedSize u_length = 0, v_length;
 
-  @return An error code: 0 - success, otherwise - failure
+  CeedCall(CeedBasisGetDimension(basis, &dim));
+  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+  CeedCall(CeedBasisGetNumNodes(basis, &num_nodes));
+  CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
+  CeedCall(CeedVectorGetLength(v, &v_length));
+  if (u) CeedCall(CeedVectorGetLength(u, &u_length));
 
-  @ref Backend
-**/
-int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor) {
-  *is_tensor = basis->is_tensor_basis;
+  // Check vector lengths to prevent out of bounds issues
+  bool has_good_dims = true;
+  switch (eval_mode) {
+    case CEED_EVAL_NONE:
+    case CEED_EVAL_INTERP:
+    case CEED_EVAL_GRAD:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      has_good_dims = ((t_mode == CEED_TRANSPOSE && u_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_qpts * (CeedSize)q_comp &&
+                        v_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes) ||
+                       (t_mode == CEED_NOTRANSPOSE && v_length >= (CeedSize)num_elem * (CeedSize)num_qpts * (CeedSize)num_comp * (CeedSize)q_comp &&
+                        u_length >= (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes));
+      break;
+    case CEED_EVAL_WEIGHT:
+      has_good_dims = v_length >= (CeedSize)num_elem * (CeedSize)num_qpts;
+      break;
+  }
+  CeedCheck(has_good_dims, CeedBasisReturnCeed(basis), CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
   return CEED_ERROR_SUCCESS;
 }
 
 /**
-  @brief Get backend data of a `CeedBasis`
+  @brief Check input vector dimensions for CeedBasisApply[Add]AtPoints
 
-  @param[in]  basis `CeedBasis`
-  @param[out] data  Variable to store data
+  @param[in]  basis      `CeedBasis` to evaluate
+  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
+  @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
+                           @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes
+  @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
+                           @ref CEED_EVAL_GRAD to use gradients,
+                           @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  x_ref      `CeedVector` holding reference coordinates of each point
+  @param[in]  u          Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE
+  @param[out] v          Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref Backend
+  @ref Developer
 **/
-int CeedBasisGetData(CeedBasis basis, void *data) {
-  *(void **)data = basis->data;
-  return CEED_ERROR_SUCCESS;
-}
+static int CeedBasisApplyAtPointsCheckDims(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                           CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedInt  dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1, total_num_points = 0;
+  CeedSize x_length = 0, u_length = 0, v_length;
 
-/**
-  @brief Set backend data of a `CeedBasis`
+  CeedCall(CeedBasisGetDimension(basis, &dim));
+  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
+  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &num_q_comp));
+  CeedCall(CeedBasisGetNumNodes(basis, &num_nodes));
+  CeedCall(CeedVectorGetLength(v, &v_length));
+  if (x_ref != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(x_ref, &x_length));
+  if (u != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(u, &u_length));
 
-  @param[in,out] basis  `CeedBasis`
-  @param[in]     data   Data to set
+  // Check compatibility coordinates vector
+  for (CeedInt i = 0; i < num_elem; i++) total_num_points += num_points[i];
+  CeedCheck((x_length >= (CeedSize)total_num_points * (CeedSize)dim) || (eval_mode == CEED_EVAL_WEIGHT), CeedBasisReturnCeed(basis),
+            CEED_ERROR_DIMENSION,
+            "Length of reference coordinate vector incompatible with basis dimension and number of points."
+            " Found reference coordinate vector of length %" CeedSize_FMT ", not of length %" CeedSize_FMT ".",
+            x_length, (CeedSize)total_num_points * (CeedSize)dim);
 
-  @return An error code: 0 - success, otherwise - failure
+  // Check CEED_EVAL_WEIGHT only on CEED_NOTRANSPOSE
+  CeedCheck(eval_mode != CEED_EVAL_WEIGHT || t_mode == CEED_NOTRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED,
+            "CEED_EVAL_WEIGHT only supported with CEED_NOTRANSPOSE");
 
-  @ref Backend
-**/
-int CeedBasisSetData(CeedBasis basis, void *data) {
-  basis->data = data;
+  // Check vector lengths to prevent out of bounds issues
+  bool has_good_dims = true;
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP:
+      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp ||
+                                                     v_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp)) ||
+                       (t_mode == CEED_NOTRANSPOSE && (v_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp ||
+                                                       u_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp)));
+      break;
+    case CEED_EVAL_GRAD:
+      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp * (CeedSize)dim ||
+                                                     v_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp)) ||
+                       (t_mode == CEED_NOTRANSPOSE && (v_length >= (CeedSize)total_num_points * (CeedSize)num_q_comp * (CeedSize)dim ||
+                                                       u_length >= (CeedSize)num_elem * (CeedSize)num_nodes * (CeedSize)num_comp)));
+      break;
+    case CEED_EVAL_WEIGHT:
+      has_good_dims = t_mode == CEED_NOTRANSPOSE && (v_length >= total_num_points);
+      break;
+      // LCOV_EXCL_START
+    case CEED_EVAL_NONE:
+    case CEED_EVAL_DIV:
+    case CEED_EVAL_CURL:
+      return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points not supported for %s",
+                       CeedEvalModes[eval_mode]);
+      // LCOV_EXCL_STOP
+  }
+  CeedCheck(has_good_dims, CeedBasisReturnCeed(basis), CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
   return CEED_ERROR_SUCCESS;
 }
 
 /**
-  @brief Increment the reference counter for a `CeedBasis`
+  @brief Default implimentation to apply basis evaluation from nodes to arbitrary points
 
-  @param[in,out] basis `CeedBasis` to increment the reference counter
+  @param[in]  basis      `CeedBasis` to evaluate
+  @param[in]  apply_add  Sum result into target vector or overwrite
+  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
+  @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
+                           @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes
+  @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
+                           @ref CEED_EVAL_GRAD to use gradients,
+                           @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  x_ref      `CeedVector` holding reference coordinates of each point
+  @param[in]  u          Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE
+  @param[out] v          Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref Backend
+  @ref Developer
 **/
-int CeedBasisReference(CeedBasis basis) {
-  basis->ref_count++;
-  return CEED_ERROR_SUCCESS;
-}
+static int CeedBasisApplyAtPoints_Core(CeedBasis basis, bool apply_add, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
+                                       CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedInt dim, num_comp, P_1d = 1, Q_1d = 1, total_num_points = num_points[0];
 
-/**
-  @brief Get number of Q-vector components for given `CeedBasis`
+  CeedCall(CeedBasisGetDimension(basis, &dim));
+  // Inserting check because clang-tidy doesn't understand this cannot occur
+  CeedCheck(dim > 0, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Malformed CeedBasis, dim > 0 is required");
+  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
+  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
 
-  @param[in]  basis     `CeedBasis`
-  @param[in]  eval_mode @ref CEED_EVAL_INTERP to use interpolated values,
-                          @ref CEED_EVAL_GRAD to use gradients,
-                          @ref CEED_EVAL_DIV to use divergence,
-                          @ref CEED_EVAL_CURL to use curl
-  @param[out] q_comp    Variable to store number of Q-vector components of basis
+  // Default implementation
+  {
+    bool is_tensor_basis;
 
-  @return An error code: 0 - success, otherwise - failure
+    CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis));
+    CeedCheck(is_tensor_basis, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED,
+              "Evaluation at arbitrary points only supported for tensor product bases");
+  }
+  CeedCheck(num_elem == 1, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED,
+            "Evaluation at arbitrary  points only supported for a single element at a time");
+  if (eval_mode == CEED_EVAL_WEIGHT) {
+    CeedCall(CeedVectorSetValue(v, 1.0));
+    return CEED_ERROR_SUCCESS;
+  }
+  if (!basis->basis_chebyshev) {
+    // Build basis mapping from nodes to Chebyshev coefficients
+    CeedScalar       *chebyshev_interp_1d, *chebyshev_grad_1d, *chebyshev_q_weight_1d;
+    const CeedScalar *q_ref_1d;
+    Ceed              ceed;
 
-  @ref Backend
-**/
-int CeedBasisGetNumQuadratureComponents(CeedBasis basis, CeedEvalMode eval_mode, CeedInt *q_comp) {
-  CeedInt dim;
+    CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+    CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_grad_1d));
+    CeedCall(CeedCalloc(Q_1d, &chebyshev_q_weight_1d));
+    CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
+    CeedCall(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
 
-  CeedCall(CeedBasisGetDimension(basis, &dim));
-  switch (eval_mode) {
-    case CEED_EVAL_INTERP: {
-      CeedFESpace fe_space;
+    CeedCall(CeedBasisGetCeed(basis, &ceed));
+    CeedCall(CeedVectorCreate(ceed, num_comp * CeedIntPow(Q_1d, dim), &basis->vec_chebyshev));
+    CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d, Q_1d, chebyshev_interp_1d, chebyshev_grad_1d, q_ref_1d, chebyshev_q_weight_1d,
+                                     &basis->basis_chebyshev));
 
-      CeedCall(CeedBasisGetFESpace(basis, &fe_space));
-      *q_comp = (fe_space == CEED_FE_SPACE_H1) ? 1 : dim;
-    } break;
-    case CEED_EVAL_GRAD:
-      *q_comp = dim;
-      break;
-    case CEED_EVAL_DIV:
-      *q_comp = 1;
-      break;
-    case CEED_EVAL_CURL:
-      *q_comp = (dim < 3) ? 1 : dim;
-      break;
-    case CEED_EVAL_NONE:
-    case CEED_EVAL_WEIGHT:
-      *q_comp = 1;
-      break;
+    // Cleanup
+    CeedCall(CeedFree(&chebyshev_interp_1d));
+    CeedCall(CeedFree(&chebyshev_grad_1d));
+    CeedCall(CeedFree(&chebyshev_q_weight_1d));
+    CeedCall(CeedDestroy(&ceed));
   }
-  return CEED_ERROR_SUCCESS;
-}
 
-/**
-  @brief Estimate number of FLOPs required to apply `CeedBasis` in `t_mode` and `eval_mode`
+  // Create TensorContract object if needed, such as a basis from the GPU backends
+  if (!basis->contract) {
+    Ceed      ceed_ref;
+    CeedBasis basis_ref = NULL;
 
-  @param[in]  basis     `CeedBasis` to estimate FLOPs for
-  @param[in]  t_mode    Apply basis or transpose
-  @param[in]  eval_mode @ref CeedEvalMode
-  @param[out] flops     Address of variable to hold FLOPs estimate
+    CeedCall(CeedInit("/cpu/self", &ceed_ref));
+    // Only need matching tensor contraction dimensions, any type of basis will work
+    CeedCall(CeedBasisCreateTensorH1Lagrange(ceed_ref, dim, num_comp, P_1d, Q_1d, CEED_GAUSS, &basis_ref));
+    // Note - clang-tidy doesn't know basis_ref->contract must be valid here
+    CeedCheck(basis_ref && basis_ref->contract, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED,
+              "Reference CPU ceed failed to create a tensor contraction object");
+    CeedCall(CeedTensorContractReferenceCopy(basis_ref->contract, &basis->contract));
+    CeedCall(CeedBasisDestroy(&basis_ref));
+    CeedCall(CeedDestroy(&ceed_ref));
+  }
 
-  @ref Backend
-**/
-int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedSize *flops) {
-  bool is_tensor;
+  // Basis evaluation
+  switch (t_mode) {
+    case CEED_NOTRANSPOSE: {
+      // Nodes to arbitrary points
+      CeedScalar       *v_array;
+      const CeedScalar *chebyshev_coeffs, *x_array_read;
+
+      // -- Interpolate to Chebyshev coefficients
+      CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, basis->vec_chebyshev));
+
+      // -- Evaluate Chebyshev polynomials at arbitrary points
+      CeedCall(CeedVectorGetArrayRead(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs));
+      CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read));
+      CeedCall(CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &v_array));
+      switch (eval_mode) {
+        case CEED_EVAL_INTERP: {
+          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
+
+          // ---- Values at point
+          for (CeedInt p = 0; p < total_num_points; p++) {
+            CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1;
+
+            for (CeedInt d = 0; d < dim; d++) {
+              // ------ Tensor contract with current Chebyshev polynomial values
+              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+              CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false,
+                                               d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2]));
+              pre /= Q_1d;
+              post *= 1;
+            }
+            for (CeedInt c = 0; c < num_comp; c++) v_array[c * total_num_points + p] = tmp[dim % 2][c];
+          }
+          break;
+        }
+        case CEED_EVAL_GRAD: {
+          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
+
+          // ---- Values at point
+          for (CeedInt p = 0; p < total_num_points; p++) {
+            // Dim**2 contractions, apply grad when pass == dim
+            for (CeedInt pass = 0; pass < dim; pass++) {
+              CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1;
+
+              for (CeedInt d = 0; d < dim; d++) {
+                // ------ Tensor contract with current Chebyshev polynomial values
+                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+                CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false,
+                                                 d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2]));
+                pre /= Q_1d;
+                post *= 1;
+              }
+              for (CeedInt c = 0; c < num_comp; c++) v_array[(pass * num_comp + c) * total_num_points + p] = tmp[dim % 2][c];
+            }
+          }
+          break;
+        }
+        default:
+          // Nothing to do, excluded above
+          break;
+      }
+      CeedCall(CeedVectorRestoreArrayRead(basis->vec_chebyshev, &chebyshev_coeffs));
+      CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read));
+      CeedCall(CeedVectorRestoreArray(v, &v_array));
+      break;
+    }
+    case CEED_TRANSPOSE: {
+      // Note: No switch on e_mode here because only CEED_EVAL_INTERP is supported at this time
+      // Arbitrary points to nodes
+      CeedScalar       *chebyshev_coeffs;
+      const CeedScalar *u_array, *x_array_read;
+
+      // -- Transpose of evaluation of Chebyshev polynomials at arbitrary points
+      CeedCall(CeedVectorGetArrayWrite(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs));
+      CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read));
+      CeedCall(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array));
+
+      switch (eval_mode) {
+        case CEED_EVAL_INTERP: {
+          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
+
+          // ---- Values at point
+          for (CeedInt p = 0; p < total_num_points; p++) {
+            CeedInt pre = num_comp * 1, post = 1;
+
+            for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[c * total_num_points + p];
+            for (CeedInt d = 0; d < dim; d++) {
+              // ------ Tensor contract with current Chebyshev polynomial values
+              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+              CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode, p > 0 && d == (dim - 1), tmp[d % 2],
+                                               d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2]));
+              pre /= 1;
+              post *= Q_1d;
+            }
+          }
+          break;
+        }
+        case CEED_EVAL_GRAD: {
+          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
+
+          // ---- Values at point
+          for (CeedInt p = 0; p < total_num_points; p++) {
+            // Dim**2 contractions, apply grad when pass == dim
+            for (CeedInt pass = 0; pass < dim; pass++) {
+              CeedInt pre = num_comp * 1, post = 1;
+
+              for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[(pass * num_comp + c) * total_num_points + p];
+              for (CeedInt d = 0; d < dim; d++) {
+                // ------ Tensor contract with current Chebyshev polynomial values
+                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * total_num_points + p], Q_1d, chebyshev_x));
+                CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode,
+                                                 (p > 0 || (p == 0 && pass > 0)) && d == (dim - 1), tmp[d % 2],
+                                                 d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2]));
+                pre /= 1;
+                post *= Q_1d;
+              }
+            }
+          }
+          break;
+        }
+        default:
+          // Nothing to do, excluded above
+          break;
+      }
+      CeedCall(CeedVectorRestoreArray(basis->vec_chebyshev, &chebyshev_coeffs));
+      CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read));
+      CeedCall(CeedVectorRestoreArrayRead(u, &u_array));
+
+      // -- Interpolate transpose from Chebyshev coefficients
+      if (apply_add) CeedCall(CeedBasisApplyAdd(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v));
+      else CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v));
+      break;
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/// @}
+
+/// ----------------------------------------------------------------------------
+/// Ceed Backend API
+/// ----------------------------------------------------------------------------
+/// @addtogroup CeedBasisBackend
+/// @{
+
+/**
+  @brief Fallback to a reference implementation for a non tensor-product basis for \f$H^1\f$ discretizations.
+    This function may only be called inside of a backend `BasisCreateH1` function.
+    This is used by a backend when the specific parameters for a `CeedBasis` exceed the backend's support, such as
+    when a `interp` and `grad` matrices require too many bytes to fit into shared memory on a GPU.
+
+  @param[in]  ceed      `Ceed` object used to create the `CeedBasis`
+  @param[in]  topo      Topology of element, e.g. hypercube, simplex, etc
+  @param[in]  num_comp  Number of field components (1 for scalar fields)
+  @param[in]  num_nodes Total number of nodes
+  @param[in]  num_qpts  Total number of quadrature points
+  @param[in]  interp    Row-major (`num_qpts * num_nodes`) matrix expressing the values of nodal basis functions at quadrature points
+  @param[in]  grad      Row-major (`dim * num_qpts * num_nodes`) matrix expressing derivatives of nodal basis functions at quadrature points
+  @param[in]  q_ref     Array of length `num_qpts * dim` holding the locations of quadrature points on the reference element
+  @param[in]  q_weight  Array of length `num_qpts` holding the quadrature weights on the reference element
+  @param[out] basis     Newly created `CeedBasis`
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedBasisCreateH1Fallback(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                              const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
+  CeedInt P = num_nodes, Q = num_qpts, dim = 0;
+  Ceed    delegate;
+
+  CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
+  CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateH1");
+
+  CeedCall(CeedReferenceCopy(delegate, &(basis)->obj.ceed));
+  CeedCall(CeedBasisGetTopologyDimension(topo, &dim));
+  CeedCall(delegate->BasisCreateH1(topo, dim, P, Q, interp, grad, q_ref, q_weight, basis));
+  CeedCall(CeedDestroy(&delegate));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Return collocated gradient matrix
+
+  @param[in]  basis         `CeedBasis`
+  @param[out] collo_grad_1d Row-major (`Q_1d * Q_1d`) matrix expressing derivatives of basis functions at quadrature points
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *collo_grad_1d) {
+  Ceed              ceed;
+  CeedInt           P_1d, Q_1d;
+  CeedScalar       *interp_1d_pinv;
+  const CeedScalar *grad_1d, *interp_1d;
+
+  // Note: This function is for backend use, so all errors are terminal and we do not need to clean up memory on failure.
+  CeedCall(CeedBasisGetCeed(basis, &ceed));
+  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
+  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+
+  // Compute interp_1d^+, pseudoinverse of interp_1d
+  CeedCall(CeedCalloc(P_1d * Q_1d, &interp_1d_pinv));
+  CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
+  CeedCall(CeedMatrixPseudoinverse(ceed, interp_1d, Q_1d, P_1d, interp_1d_pinv));
+  CeedCall(CeedBasisGetGrad1D(basis, &grad_1d));
+  CeedCall(CeedMatrixMatrixMultiply(ceed, grad_1d, (const CeedScalar *)interp_1d_pinv, collo_grad_1d, Q_1d, Q_1d, P_1d));
+
+  CeedCall(CeedFree(&interp_1d_pinv));
+  CeedCall(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Return 1D interpolation matrix to Chebyshev polynomial coefficients on quadrature space
+
+  @param[in]  basis               `CeedBasis`
+  @param[out] chebyshev_interp_1d Row-major (`P_1d * Q_1d`) matrix interpolating from basis nodes to Chebyshev polynomial coefficients
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisGetChebyshevInterp1D(CeedBasis basis, CeedScalar *chebyshev_interp_1d) {
+  CeedInt           P_1d, Q_1d;
+  CeedScalar       *C, *chebyshev_coeffs_1d_inv;
+  const CeedScalar *interp_1d, *q_ref_1d;
+  Ceed              ceed;
+
+  CeedCall(CeedBasisGetCeed(basis, &ceed));
+  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
+  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
+
+  // Build coefficient matrix
+  // -- Note: Clang-tidy needs this check
+  CeedCheck((P_1d > 0) && (Q_1d > 0), ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis dimensions are malformed");
+  CeedCall(CeedCalloc(Q_1d * Q_1d, &C));
+  CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
+  for (CeedInt i = 0; i < Q_1d; i++) CeedCall(CeedChebyshevPolynomialsAtPoint(q_ref_1d[i], Q_1d, &C[i * Q_1d]));
+
+  // Compute C^+, pseudoinverse of coefficient matrix
+  CeedCall(CeedCalloc(Q_1d * Q_1d, &chebyshev_coeffs_1d_inv));
+  CeedCall(CeedMatrixPseudoinverse(ceed, C, Q_1d, Q_1d, chebyshev_coeffs_1d_inv));
+
+  // Build mapping from nodes to Chebyshev coefficients
+  CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
+  CeedCall(CeedMatrixMatrixMultiply(ceed, chebyshev_coeffs_1d_inv, interp_1d, chebyshev_interp_1d, Q_1d, P_1d, Q_1d));
+
+  // Cleanup
+  CeedCall(CeedFree(&C));
+  CeedCall(CeedFree(&chebyshev_coeffs_1d_inv));
+  CeedCall(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get tensor status for given `CeedBasis`
+
+  @param[in]  basis     `CeedBasis`
+  @param[out] is_tensor Variable to store tensor status
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor) {
+  *is_tensor = basis->is_tensor_basis;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Determine if given `CeedBasis` has nodes collocated with quadrature points
+
+  @param[in]  basis         `CeedBasis`
+  @param[out] is_collocated Variable to store collocated status
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisIsCollocated(CeedBasis basis, bool *is_collocated) {
+  if (basis->is_tensor_basis && (basis->Q_1d == basis->P_1d)) {
+    *is_collocated = true;
+
+    for (CeedInt i = 0; i < basis->P_1d; i++) {
+      *is_collocated = *is_collocated && (fabs(basis->interp_1d[i + basis->P_1d * i] - 1.0) < 10 * CEED_EPSILON);
+      for (CeedInt j = 0; j < basis->Q_1d; j++) {
+        if (j != i) *is_collocated = *is_collocated && (fabs(basis->interp_1d[j + basis->P_1d * i]) < 10 * CEED_EPSILON);
+      }
+    }
+  } else {
+    *is_collocated = false;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get backend data of a `CeedBasis`
+
+  @param[in]  basis `CeedBasis`
+  @param[out] data  Variable to store data
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisGetData(CeedBasis basis, void *data) {
+  *(void **)data = basis->data;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Set backend data of a `CeedBasis`
+
+  @param[in,out] basis  `CeedBasis`
+  @param[in]     data   Data to set
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisSetData(CeedBasis basis, void *data) {
+  basis->data = data;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Increment the reference counter for a `CeedBasis`
+
+  @param[in,out] basis `CeedBasis` to increment the reference counter
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisReference(CeedBasis basis) {
+  CeedCall(CeedObjectReference((CeedObject)basis));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get number of Q-vector components for given `CeedBasis`
+
+  @param[in]  basis     `CeedBasis`
+  @param[in]  eval_mode @ref CEED_EVAL_INTERP to use interpolated values,
+                          @ref CEED_EVAL_GRAD to use gradients,
+                          @ref CEED_EVAL_DIV to use divergence,
+                          @ref CEED_EVAL_CURL to use curl
+  @param[out] q_comp    Variable to store number of Q-vector components of basis
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedBasisGetNumQuadratureComponents(CeedBasis basis, CeedEvalMode eval_mode, CeedInt *q_comp) {
+  CeedInt dim;
+
+  CeedCall(CeedBasisGetDimension(basis, &dim));
+  switch (eval_mode) {
+    case CEED_EVAL_INTERP: {
+      CeedFESpace fe_space;
+
+      CeedCall(CeedBasisGetFESpace(basis, &fe_space));
+      *q_comp = (fe_space == CEED_FE_SPACE_H1) ? 1 : dim;
+    } break;
+    case CEED_EVAL_GRAD:
+      *q_comp = dim;
+      break;
+    case CEED_EVAL_DIV:
+      *q_comp = 1;
+      break;
+    case CEED_EVAL_CURL:
+      *q_comp = (dim < 3) ? 1 : dim;
+      break;
+    case CEED_EVAL_NONE:
+    case CEED_EVAL_WEIGHT:
+      *q_comp = 1;
+      break;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Estimate number of FLOPs required to apply `CeedBasis` in `t_mode` and `eval_mode`
+
+  @param[in]  basis        `CeedBasis` to estimate FLOPs for
+  @param[in]  t_mode       Apply basis or transpose
+  @param[in]  eval_mode    @ref CeedEvalMode
+  @param[in]  is_at_points Evaluate the basis at points or quadrature points
+  @param[in]  num_points   Number of points basis is evaluated at
+  @param[out] flops        Address of variable to hold FLOPs estimate
+
+  @ref Backend
+**/
+int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, bool is_at_points, CeedInt num_points,
+                              CeedSize *flops) {
+  bool is_tensor;
 
   CeedCall(CeedBasisIsTensor(basis, &is_tensor));
+  CeedCheck(!is_at_points || is_tensor, CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Can only evaluate tensor-product bases at points");
   if (is_tensor) {
     CeedInt dim, num_comp, P_1d, Q_1d;
 
@@ -450,32 +952,92 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
       Q_1d = P_1d;
     }
     CeedInt tensor_flops = 0, pre = num_comp * CeedIntPow(P_1d, dim - 1), post = 1;
+
     for (CeedInt d = 0; d < dim; d++) {
       tensor_flops += 2 * pre * P_1d * post * Q_1d;
       pre /= P_1d;
       post *= Q_1d;
     }
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        *flops = 0;
-        break;
-      case CEED_EVAL_INTERP:
-        *flops = tensor_flops;
-        break;
-      case CEED_EVAL_GRAD:
-        *flops = tensor_flops * 2;
-        break;
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        // LCOV_EXCL_START
-        return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported",
-                         CeedEvalModes[eval_mode]);
-        break;
-        // LCOV_EXCL_STOP
+    if (is_at_points) {
+      bool is_gpu = false;
+
+      {
+        CeedMemType mem_type;
+
+        CeedCall(CeedGetPreferredMemType(CeedBasisReturnCeed(basis), &mem_type));
+        is_gpu = mem_type == CEED_MEM_DEVICE;
+      }
+
+      CeedInt chebyshev_flops = (Q_1d - 2) * 3 + 1, d_chebyshev_flops = (Q_1d - 2) * 8 + 1;
+      CeedInt point_tensor_flops = 0, pre = CeedIntPow(Q_1d, dim - 1), post = 1;
+
+      for (CeedInt d = 0; d < dim; d++) {
+        point_tensor_flops += 2 * pre * Q_1d * post * 1;
+        pre /= P_1d;
+        post *= Q_1d;
+      }
+
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          *flops = 0;
+          break;
+        case CEED_EVAL_INTERP: {
+          *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
+          if (dim == 3 && is_gpu) {
+            *flops += num_points * Q_1d *
+                      (chebyshev_flops + num_comp * (2 * chebyshev_flops + 2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 * Q_1d + 1 : 3 * Q_1d)));
+          } else {
+            *flops += num_points * (is_gpu ? num_comp : 1) * dim * chebyshev_flops;
+          }
+          break;
+        }
+        case CEED_EVAL_GRAD: {
+          *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
+          if (dim == 3 && is_gpu) {
+            CeedInt inner_flops =
+                dim * (2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d) + (dim - 1) * (2 * chebyshev_flops + d_chebyshev_flops);
+
+            *flops += num_points * Q_1d * (chebyshev_flops + d_chebyshev_flops + num_comp * (inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0)));
+          } else {
+            *flops += num_points * (is_gpu ? num_comp : 1) * dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops);
+          }
+          break;
+        }
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL: {
+          // LCOV_EXCL_START
+          return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported at points",
+                           CeedEvalModes[eval_mode]);
+          break;
+          // LCOV_EXCL_STOP
+        }
+        case CEED_EVAL_WEIGHT:
+          *flops = num_points;
+          break;
+      }
+    } else {
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          *flops = 0;
+          break;
+        case CEED_EVAL_INTERP:
+          *flops = tensor_flops;
+          break;
+        case CEED_EVAL_GRAD:
+          *flops = tensor_flops * 2;
+          break;
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL: {
+          // LCOV_EXCL_START
+          return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported",
+                           CeedEvalModes[eval_mode]);
+          break;
+          // LCOV_EXCL_STOP
+        }
+        case CEED_EVAL_WEIGHT:
+          *flops = dim * CeedIntPow(Q_1d, dim);
+          break;
       }
-      case CEED_EVAL_WEIGHT:
-        *flops = dim * CeedIntPow(Q_1d, dim);
-        break;
     }
   } else {
     CeedInt dim, num_comp, q_comp, num_nodes, num_qpts;
@@ -977,8 +1539,9 @@ int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support BasisCreateTensorH1");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateTensorH1");
     CeedCall(CeedBasisCreateTensorH1(delegate, dim, num_comp, P_1d, Q_1d, interp_1d, grad_1d, q_ref_1d, q_weight_1d, basis));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -990,8 +1553,7 @@ int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_
   CeedElemTopology topo = dim == 1 ? CEED_TOPOLOGY_LINE : dim == 2 ? CEED_TOPOLOGY_QUAD : CEED_TOPOLOGY_HEX;
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed));
-  (*basis)->ref_count       = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = true;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -1094,13 +1656,13 @@ int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, Ce
   @brief Create a non tensor-product basis for \f$H^1\f$ discretizations
 
   @param[in]  ceed      `Ceed` object used to create the `CeedBasis`
-  @param[in]  topo      Topology of element, e.g. hypercube, simplex, ect
+  @param[in]  topo      Topology of element, e.g. hypercube, simplex, etc
   @param[in]  num_comp  Number of field components (1 for scalar fields)
   @param[in]  num_nodes Total number of nodes
   @param[in]  num_qpts  Total number of quadrature points
   @param[in]  interp    Row-major (`num_qpts * num_nodes`) matrix expressing the values of nodal basis functions at quadrature points
   @param[in]  grad      Row-major (`dim * num_qpts * num_nodes`) matrix expressing derivatives of nodal basis functions at quadrature points
-  @param[in]  q_ref     Array of length `num_qpts` * dim holding the locations of quadrature points on the reference element
+  @param[in]  q_ref     Array of length `num_qpts * dim` holding the locations of quadrature points on the reference element
   @param[in]  q_weight  Array of length `num_qpts` holding the quadrature weights on the reference element
   @param[out] basis     Address of the variable where the newly created `CeedBasis` will be stored
 
@@ -1116,8 +1678,9 @@ int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedIn
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support BasisCreateH1");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateH1");
     CeedCall(CeedBasisCreateH1(delegate, topo, num_comp, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1128,8 +1691,7 @@ int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedIn
   CeedCall(CeedBasisGetTopologyDimension(topo, &dim));
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed));
-  (*basis)->ref_count       = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = false;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -1177,6 +1739,7 @@ int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Ceed
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateHdiv");
     CeedCall(CeedBasisCreateHdiv(delegate, topo, num_comp, num_nodes, num_qpts, interp, div, q_ref, q_weight, basis));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1187,8 +1750,7 @@ int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Ceed
   CeedCall(CeedBasisGetTopologyDimension(topo, &dim));
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed));
-  (*basis)->ref_count       = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = false;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -1236,6 +1798,7 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateHcurl");
     CeedCall(CeedBasisCreateHcurl(delegate, topo, num_comp, num_nodes, num_qpts, interp, curl, q_ref, q_weight, basis));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1247,8 +1810,7 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee
   curl_comp = (dim < 3) ? 1 : dim;
 
   CeedCall(CeedCalloc(1, basis));
-  CeedCall(CeedReferenceCopy(ceed, &(*basis)->ceed));
-  (*basis)->ref_count       = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedBasisView_Object, CeedBasisDestroy_Object, &(*basis)->obj));
   (*basis)->is_tensor_basis = false;
   (*basis)->dim             = dim;
   (*basis)->topo            = topo;
@@ -1281,6 +1843,8 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee
   Note: `basis_project` will have the same number of components as `basis_from`, regardless of the number of components that `basis_to` has.
         If `basis_from` has 3 components and `basis_to` has 5 components, then `basis_project` will have 3 components.
 
+  Note: If either `basis_from` or `basis_to` are non-tensor, then `basis_project` will also be non-tensor
+
   @param[in]  basis_from    `CeedBasis` to prolong from
   @param[in]  basis_to      `CeedBasis` to prolong to
   @param[out] basis_project Address of the variable where the newly created `CeedBasis` will be stored
@@ -1291,9 +1855,9 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee
 **/
 int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis *basis_project) {
   Ceed        ceed;
-  bool        is_tensor;
+  bool        create_tensor;
   CeedInt     dim, num_comp;
-  CeedScalar *q_ref, *q_weight, *interp_project, *grad_project;
+  CeedScalar *interp_project, *grad_project;
 
   CeedCall(CeedBasisGetCeed(basis_to, &ceed));
 
@@ -1301,35 +1865,36 @@ int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasi
   CeedCall(CeedBasisCreateProjectionMatrices(basis_from, basis_to, &interp_project, &grad_project));
 
   // Build basis
-  CeedCall(CeedBasisIsTensor(basis_to, &is_tensor));
+  {
+    bool is_tensor_to, is_tensor_from;
+
+    CeedCall(CeedBasisIsTensor(basis_to, &is_tensor_to));
+    CeedCall(CeedBasisIsTensor(basis_from, &is_tensor_from));
+    create_tensor = is_tensor_from && is_tensor_to;
+  }
   CeedCall(CeedBasisGetDimension(basis_to, &dim));
   CeedCall(CeedBasisGetNumComponents(basis_from, &num_comp));
-  if (is_tensor) {
+  if (create_tensor) {
     CeedInt P_1d_to, P_1d_from;
 
     CeedCall(CeedBasisGetNumNodes1D(basis_from, &P_1d_from));
     CeedCall(CeedBasisGetNumNodes1D(basis_to, &P_1d_to));
-    CeedCall(CeedCalloc(P_1d_to, &q_ref));
-    CeedCall(CeedCalloc(P_1d_to, &q_weight));
-    CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d_from, P_1d_to, interp_project, grad_project, q_ref, q_weight, basis_project));
+    CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d_from, P_1d_to, interp_project, grad_project, NULL, NULL, basis_project));
   } else {
     // Even if basis_to and basis_from are not H1, the resulting basis is H1 for interpolation to work
     CeedInt          num_nodes_to, num_nodes_from;
     CeedElemTopology topo;
 
-    CeedCall(CeedBasisGetTopology(basis_to, &topo));
+    CeedCall(CeedBasisGetTopology(basis_from, &topo));
     CeedCall(CeedBasisGetNumNodes(basis_from, &num_nodes_from));
     CeedCall(CeedBasisGetNumNodes(basis_to, &num_nodes_to));
-    CeedCall(CeedCalloc(num_nodes_to * dim, &q_ref));
-    CeedCall(CeedCalloc(num_nodes_to, &q_weight));
-    CeedCall(CeedBasisCreateH1(ceed, topo, num_comp, num_nodes_from, num_nodes_to, interp_project, grad_project, q_ref, q_weight, basis_project));
+    CeedCall(CeedBasisCreateH1(ceed, topo, num_comp, num_nodes_from, num_nodes_to, interp_project, grad_project, NULL, NULL, basis_project));
   }
 
   // Cleanup
   CeedCall(CeedFree(&interp_project));
   CeedCall(CeedFree(&grad_project));
-  CeedCall(CeedFree(&q_ref));
-  CeedCall(CeedFree(&q_weight));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1353,6 +1918,36 @@ int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedBasisView() output
+
+  @param[in] basis    `CeedBasis` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedBasisSetNumViewTabs(CeedBasis basis, CeedInt num_tabs) {
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)basis, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get the number of tabs to indent for @ref CeedBasisView() output
+
+  @param[in]  basis    `CeedBasis` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedBasisGetNumViewTabs(CeedBasis basis, CeedInt *num_tabs) {
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)basis, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedBasis`
 
@@ -1365,6 +1960,7 @@ int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy) {
 **/
 int CeedBasisView(CeedBasis basis, FILE *stream) {
   bool             is_tensor_basis;
+  char            *tabs = NULL;
   CeedElemTopology topo;
   CeedFESpace      fe_space;
 
@@ -1373,14 +1969,22 @@ int CeedBasisView(CeedBasis basis, FILE *stream) {
   CeedCall(CeedBasisGetTopology(basis, &topo));
   CeedCall(CeedBasisGetFESpace(basis, &fe_space));
 
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedBasisGetNumViewTabs(basis, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
+
   // Print FE space and element topology of the basis
-  fprintf(stream, "CeedBasis in a %s on a %s element\n", CeedFESpaces[fe_space], CeedElemTopologies[topo]);
+  fprintf(stream, "%sCeedBasis in a %s on a %s element\n", tabs, CeedFESpaces[fe_space], CeedElemTopologies[topo]);
   if (is_tensor_basis) {
-    fprintf(stream, "  P: %" CeedInt_FMT "\n  Q: %" CeedInt_FMT "\n", basis->P_1d, basis->Q_1d);
+    fprintf(stream, "%s  P: %" CeedInt_FMT "\n%s  Q: %" CeedInt_FMT "\n", tabs, basis->P_1d, tabs, basis->Q_1d);
   } else {
-    fprintf(stream, "  P: %" CeedInt_FMT "\n  Q: %" CeedInt_FMT "\n", basis->P, basis->Q);
+    fprintf(stream, "%s  P: %" CeedInt_FMT "\n%s  Q: %" CeedInt_FMT "\n", tabs, basis->P, tabs, basis->Q);
   }
-  fprintf(stream, "  dimension: %" CeedInt_FMT "\n  field components: %" CeedInt_FMT "\n", basis->dim, basis->num_comp);
+  fprintf(stream, "%s  dimension: %" CeedInt_FMT "\n%s  field components: %" CeedInt_FMT "\n", tabs, basis->dim, tabs, basis->num_comp);
   // Print quadrature data, interpolation/gradient/divergence/curl of the basis
   if (is_tensor_basis) {  // tensor basis
     CeedInt           P_1d, Q_1d;
@@ -1393,10 +1997,10 @@ int CeedBasisView(CeedBasis basis, FILE *stream) {
     CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
     CeedCall(CeedBasisGetGrad1D(basis, &grad_1d));
 
-    CeedCall(CeedScalarView("qref1d", "\t% 12.8f", 1, Q_1d, q_ref_1d, stream));
-    CeedCall(CeedScalarView("qweight1d", "\t% 12.8f", 1, Q_1d, q_weight_1d, stream));
-    CeedCall(CeedScalarView("interp1d", "\t% 12.8f", Q_1d, P_1d, interp_1d, stream));
-    CeedCall(CeedScalarView("grad1d", "\t% 12.8f", Q_1d, P_1d, grad_1d, stream));
+    CeedCall(CeedScalarView("qref1d", "\t% 12.8f", 1, Q_1d, q_ref_1d, tabs, stream));
+    CeedCall(CeedScalarView("qweight1d", "\t% 12.8f", 1, Q_1d, q_weight_1d, tabs, stream));
+    CeedCall(CeedScalarView("interp1d", "\t% 12.8f", Q_1d, P_1d, interp_1d, tabs, stream));
+    CeedCall(CeedScalarView("grad1d", "\t% 12.8f", Q_1d, P_1d, grad_1d, tabs, stream));
   } else {  // non-tensor basis
     CeedInt           P, Q, dim, q_comp;
     const CeedScalar *q_ref, *q_weight, *interp, *grad, *div, *curl;
@@ -1411,23 +2015,24 @@ int CeedBasisView(CeedBasis basis, FILE *stream) {
     CeedCall(CeedBasisGetDiv(basis, &div));
     CeedCall(CeedBasisGetCurl(basis, &curl));
 
-    CeedCall(CeedScalarView("qref", "\t% 12.8f", 1, Q * dim, q_ref, stream));
-    CeedCall(CeedScalarView("qweight", "\t% 12.8f", 1, Q, q_weight, stream));
+    CeedCall(CeedScalarView("qref", "\t% 12.8f", 1, Q * dim, q_ref, tabs, stream));
+    CeedCall(CeedScalarView("qweight", "\t% 12.8f", 1, Q, q_weight, tabs, stream));
     CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp));
-    CeedCall(CeedScalarView("interp", "\t% 12.8f", q_comp * Q, P, interp, stream));
+    CeedCall(CeedScalarView("interp", "\t% 12.8f", q_comp * Q, P, interp, tabs, stream));
     if (grad) {
       CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp));
-      CeedCall(CeedScalarView("grad", "\t% 12.8f", q_comp * Q, P, grad, stream));
+      CeedCall(CeedScalarView("grad", "\t% 12.8f", q_comp * Q, P, grad, tabs, stream));
     }
     if (div) {
       CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp));
-      CeedCall(CeedScalarView("div", "\t% 12.8f", q_comp * Q, P, div, stream));
+      CeedCall(CeedScalarView("div", "\t% 12.8f", q_comp * Q, P, div, tabs, stream));
     }
     if (curl) {
       CeedCall(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp));
-      CeedCall(CeedScalarView("curl", "\t% 12.8f", q_comp * Q, P, curl, stream));
+      CeedCall(CeedScalarView("curl", "\t% 12.8f", q_comp * Q, P, curl, tabs, stream));
     }
   }
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1453,45 +2058,38 @@ int CeedBasisView(CeedBasis basis, FILE *stream) {
   @ref User
 **/
 int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
-  CeedInt  dim, num_comp, q_comp, num_nodes, num_qpts;
-  CeedSize u_length = 0, v_length;
-  Ceed     ceed;
-
-  CeedCall(CeedBasisGetCeed(basis, &ceed));
-  CeedCall(CeedBasisGetDimension(basis, &dim));
-  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
-  CeedCall(CeedBasisGetNumNodes(basis, &num_nodes));
-  CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
-  CeedCall(CeedVectorGetLength(v, &v_length));
-  if (u) CeedCall(CeedVectorGetLength(u, &u_length));
+  CeedCall(CeedBasisApplyCheckDims(basis, num_elem, t_mode, eval_mode, u, v));
+  CeedCheck(basis->Apply, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedBasisApply");
+  CeedCall(basis->Apply(basis, num_elem, t_mode, eval_mode, u, v));
+  return CEED_ERROR_SUCCESS;
+}
 
-  CeedCheck(basis->Apply, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedBasisApply");
+/**
+  @brief Apply basis evaluation from quadrature points to nodes and sum into target vector
 
-  // Check compatibility of topological and geometrical dimensions
-  CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0 && u_length % num_qpts == 0) ||
-                (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0 && v_length % num_qpts == 0),
-            ceed, CEED_ERROR_DIMENSION, "Length of input/output vectors incompatible with basis dimensions");
+  @param[in]  basis     `CeedBasis` to evaluate
+  @param[in]  num_elem  The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  t_mode    @ref CEED_TRANSPOSE to apply the transpose, mapping from quadrature points to nodes;
+                           @ref CEED_NOTRANSPOSE is not valid for `CeedBasisApplyAdd()`
+  @param[in]  eval_mode @ref CEED_EVAL_NONE to use values directly,
+                          @ref CEED_EVAL_INTERP to use interpolated values,
+                          @ref CEED_EVAL_GRAD to use gradients,
+                          @ref CEED_EVAL_DIV to use divergence,
+                          @ref CEED_EVAL_CURL to use curl,
+                          @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  u         Input `CeedVector`
+  @param[out] v         Output `CeedVector` to sum into
 
-  // Check vector lengths to prevent out of bounds issues
-  bool has_good_dims = true;
-  switch (eval_mode) {
-    case CEED_EVAL_NONE:
-    case CEED_EVAL_INTERP:
-    case CEED_EVAL_GRAD:
-    case CEED_EVAL_DIV:
-    case CEED_EVAL_CURL:
-      has_good_dims =
-          ((t_mode == CEED_TRANSPOSE && u_length >= num_elem * num_comp * num_qpts * q_comp && v_length >= num_elem * num_comp * num_nodes) ||
-           (t_mode == CEED_NOTRANSPOSE && v_length >= num_elem * num_qpts * num_comp * q_comp && u_length >= num_elem * num_comp * num_nodes));
-      break;
-    case CEED_EVAL_WEIGHT:
-      has_good_dims = v_length >= num_elem * num_qpts;
-      break;
-  }
-  CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
+  @return An error code: 0 - success, otherwise - failure
 
-  CeedCall(basis->Apply(basis, num_elem, t_mode, eval_mode, u, v));
+  @ref User
+**/
+int CeedBasisApplyAdd(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
+  CeedCheck(t_mode == CEED_TRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "CeedBasisApplyAdd only supports CEED_TRANSPOSE");
+  CeedCall(CeedBasisApplyCheckDims(basis, num_elem, t_mode, eval_mode, u, v));
+  CeedCheck(basis->ApplyAdd, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedBasisApplyAdd");
+  CeedCall(basis->ApplyAdd(basis, num_elem, t_mode, eval_mode, u, v));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1499,7 +2097,9 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode,
   @brief Apply basis evaluation from nodes to arbitrary points
 
   @param[in]  basis      `CeedBasis` to evaluate
-  @param[in]  num_points The number of points to apply the basis evaluation to
+  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
   @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
                            @ref CEED_TRANSPOSE to apply the transpose, mapping from points to nodes
   @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
@@ -1513,259 +2113,45 @@ int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode,
 
   @ref User
 **/
-int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u,
-                           CeedVector v) {
-  bool     is_tensor_basis;
-  CeedInt  dim, num_comp, num_q_comp, num_nodes, P_1d = 1, Q_1d = 1;
-  CeedSize x_length = 0, u_length = 0, v_length;
-  Ceed     ceed;
-
-  CeedCall(CeedBasisGetCeed(basis, &ceed));
-  CeedCall(CeedBasisGetDimension(basis, &dim));
-  CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
-  CeedCall(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
-  CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
-  CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &num_q_comp));
-  CeedCall(CeedBasisGetNumNodes(basis, &num_nodes));
-  CeedCall(CeedVectorGetLength(v, &v_length));
-  if (x_ref != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(x_ref, &x_length));
-  if (u != CEED_VECTOR_NONE) CeedCall(CeedVectorGetLength(u, &u_length));
-
-  // Check compatibility of topological and geometrical dimensions
-  CeedCheck((t_mode == CEED_TRANSPOSE && v_length % num_nodes == 0) || (t_mode == CEED_NOTRANSPOSE && u_length % num_nodes == 0) ||
-                (eval_mode == CEED_EVAL_WEIGHT),
-            ceed, CEED_ERROR_DIMENSION, "Length of input/output vectors incompatible with basis dimensions and number of points");
-
-  // Check compatibility coordinates vector
-  CeedCheck((x_length >= num_points * dim) || (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_DIMENSION,
-            "Length of reference coordinate vector incompatible with basis dimension and number of points");
-
-  // Check CEED_EVAL_WEIGHT only on CEED_NOTRANSPOSE
-  CeedCheck(eval_mode != CEED_EVAL_WEIGHT || t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_UNSUPPORTED,
-            "CEED_EVAL_WEIGHT only supported with CEED_NOTRANSPOSE");
-
-  // Check vector lengths to prevent out of bounds issues
-  bool has_good_dims = true;
-  switch (eval_mode) {
-    case CEED_EVAL_INTERP:
-      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= num_points * num_q_comp || v_length >= num_nodes * num_comp)) ||
-                       (t_mode == CEED_NOTRANSPOSE && (v_length >= num_points * num_q_comp || u_length >= num_nodes * num_comp)));
-      break;
-    case CEED_EVAL_GRAD:
-      has_good_dims = ((t_mode == CEED_TRANSPOSE && (u_length >= num_points * num_q_comp * dim || v_length >= num_nodes * num_comp)) ||
-                       (t_mode == CEED_NOTRANSPOSE && (v_length >= num_points * num_q_comp * dim || u_length >= num_nodes * num_comp)));
-      break;
-    case CEED_EVAL_WEIGHT:
-      has_good_dims = t_mode == CEED_NOTRANSPOSE && (v_length >= num_points);
-      break;
-      // LCOV_EXCL_START
-    case CEED_EVAL_NONE:
-    case CEED_EVAL_DIV:
-    case CEED_EVAL_CURL:
-      return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points not supported for %s", CeedEvalModes[eval_mode]);
-      // LCOV_EXCL_STOP
-  }
-  CeedCheck(has_good_dims, ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode");
-
-  // Backend method
+int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                           CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCall(CeedBasisApplyAtPointsCheckDims(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
   if (basis->ApplyAtPoints) {
-    CeedCall(basis->ApplyAtPoints(basis, num_points, t_mode, eval_mode, x_ref, u, v));
-    return CEED_ERROR_SUCCESS;
-  }
-
-  // Default implementation
-  CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis));
-  CeedCheck(is_tensor_basis, ceed, CEED_ERROR_UNSUPPORTED, "Evaluation at arbitrary points only supported for tensor product bases");
-  if (eval_mode == CEED_EVAL_WEIGHT) {
-    CeedCall(CeedVectorSetValue(v, 1.0));
-    return CEED_ERROR_SUCCESS;
-  }
-  if (!basis->basis_chebyshev) {
-    // Build matrix mapping from quadrature point values to Chebyshev coefficients
-    CeedScalar       *C, *chebyshev_coeffs_1d_inv;
-    const CeedScalar *q_ref_1d;
-
-    // Build coefficient matrix
-    // -- Note: Clang-tidy needs this check because it does not understand the is_tensor_basis check above
-    CeedCheck(P_1d > 0 && Q_1d > 0, ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis dimensions are malformed");
-    CeedCall(CeedCalloc(Q_1d * Q_1d, &C));
-    CeedCall(CeedBasisGetQRef(basis, &q_ref_1d));
-    for (CeedInt i = 0; i < Q_1d; i++) CeedCall(CeedChebyshevPolynomialsAtPoint(q_ref_1d[i], Q_1d, &C[i * Q_1d]));
-
-    // Compute C^+, pseudoinverse of coefficient matrix
-    CeedCall(CeedCalloc(Q_1d * Q_1d, &chebyshev_coeffs_1d_inv));
-    CeedCall(CeedMatrixPseudoinverse(ceed, C, Q_1d, Q_1d, chebyshev_coeffs_1d_inv));
-
-    // Build basis mapping from nodes to Chebyshev coefficients
-    CeedScalar       *chebyshev_interp_1d, *chebyshev_grad_1d, *chebyshev_q_weight_1d;
-    const CeedScalar *interp_1d;
-
-    CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
-    CeedCall(CeedCalloc(P_1d * Q_1d, &chebyshev_grad_1d));
-    CeedCall(CeedCalloc(Q_1d, &chebyshev_q_weight_1d));
-    CeedCall(CeedBasisGetInterp1D(basis, &interp_1d));
-    CeedCall(CeedMatrixMatrixMultiply(ceed, chebyshev_coeffs_1d_inv, interp_1d, chebyshev_interp_1d, Q_1d, P_1d, Q_1d));
-
-    CeedCall(CeedVectorCreate(ceed, num_comp * CeedIntPow(Q_1d, dim), &basis->vec_chebyshev));
-    CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d, Q_1d, chebyshev_interp_1d, chebyshev_grad_1d, q_ref_1d, chebyshev_q_weight_1d,
-                                     &basis->basis_chebyshev));
-
-    // Cleanup
-    CeedCall(CeedFree(&C));
-    CeedCall(CeedFree(&chebyshev_coeffs_1d_inv));
-    CeedCall(CeedFree(&chebyshev_interp_1d));
-    CeedCall(CeedFree(&chebyshev_grad_1d));
-    CeedCall(CeedFree(&chebyshev_q_weight_1d));
-  }
-
-  // Create TensorContract object if needed, such as a basis from the GPU backends
-  if (!basis->contract) {
-    Ceed      ceed_ref;
-    CeedBasis basis_ref = NULL;
-
-    CeedCall(CeedInit("/cpu/self", &ceed_ref));
-    // Only need matching tensor contraction dimensions, any type of basis will work
-    CeedCall(CeedBasisCreateTensorH1Lagrange(ceed_ref, dim, num_comp, P_1d, Q_1d, CEED_GAUSS, &basis_ref));
-    // Note - clang-tidy doesn't know basis_ref->contract must be valid here
-    CeedCheck(basis_ref && basis_ref->contract, ceed, CEED_ERROR_UNSUPPORTED, "Reference CPU ceed failed to create a tensor contraction object");
-    CeedCall(CeedTensorContractReferenceCopy(basis_ref->contract, &basis->contract));
-    CeedCall(CeedBasisDestroy(&basis_ref));
-    CeedCall(CeedDestroy(&ceed_ref));
+    CeedCall(basis->ApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  } else {
+    CeedCall(CeedBasisApplyAtPoints_Core(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
   }
+  return CEED_ERROR_SUCCESS;
+}
 
-  // Basis evaluation
-  switch (t_mode) {
-    case CEED_NOTRANSPOSE: {
-      // Nodes to arbitrary points
-      CeedScalar       *v_array;
-      const CeedScalar *chebyshev_coeffs, *x_array_read;
-
-      // -- Interpolate to Chebyshev coefficients
-      CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, basis->vec_chebyshev));
-
-      // -- Evaluate Chebyshev polynomials at arbitrary points
-      CeedCall(CeedVectorGetArrayRead(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs));
-      CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read));
-      CeedCall(CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &v_array));
-      switch (eval_mode) {
-        case CEED_EVAL_INTERP: {
-          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
-
-          // ---- Values at point
-          for (CeedInt p = 0; p < num_points; p++) {
-            CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1;
-
-            for (CeedInt d = 0; d < dim; d++) {
-              // ------ Tensor contract with current Chebyshev polynomial values
-              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
-              CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false,
-                                               d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2]));
-              pre /= Q_1d;
-              post *= 1;
-            }
-            for (CeedInt c = 0; c < num_comp; c++) v_array[c * num_points + p] = tmp[dim % 2][c];
-          }
-          break;
-        }
-        case CEED_EVAL_GRAD: {
-          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
-
-          // ---- Values at point
-          for (CeedInt p = 0; p < num_points; p++) {
-            // Dim**2 contractions, apply grad when pass == dim
-            for (CeedInt pass = 0; pass < dim; pass++) {
-              CeedInt pre = num_comp * CeedIntPow(Q_1d, dim - 1), post = 1;
-
-              for (CeedInt d = 0; d < dim; d++) {
-                // ------ Tensor contract with current Chebyshev polynomial values
-                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
-                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
-                CeedCall(CeedTensorContractApply(basis->contract, pre, Q_1d, post, 1, chebyshev_x, t_mode, false,
-                                                 d == 0 ? chebyshev_coeffs : tmp[d % 2], tmp[(d + 1) % 2]));
-                pre /= Q_1d;
-                post *= 1;
-              }
-              for (CeedInt c = 0; c < num_comp; c++) v_array[(pass * num_comp + c) * num_points + p] = tmp[dim % 2][c];
-            }
-          }
-          break;
-        }
-        default:
-          // Nothing to do, excluded above
-          break;
-      }
-      CeedCall(CeedVectorRestoreArrayRead(basis->vec_chebyshev, &chebyshev_coeffs));
-      CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read));
-      CeedCall(CeedVectorRestoreArray(v, &v_array));
-      break;
-    }
-    case CEED_TRANSPOSE: {
-      // Note: No switch on e_mode here because only CEED_EVAL_INTERP is supported at this time
-      // Arbitrary points to nodes
-      CeedScalar       *chebyshev_coeffs;
-      const CeedScalar *u_array, *x_array_read;
-
-      // -- Transpose of evaluation of Chebyshev polynomials at arbitrary points
-      CeedCall(CeedVectorGetArrayWrite(basis->vec_chebyshev, CEED_MEM_HOST, &chebyshev_coeffs));
-      CeedCall(CeedVectorGetArrayRead(x_ref, CEED_MEM_HOST, &x_array_read));
-      CeedCall(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array));
-
-      switch (eval_mode) {
-        case CEED_EVAL_INTERP: {
-          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
-
-          // ---- Values at point
-          for (CeedInt p = 0; p < num_points; p++) {
-            CeedInt pre = num_comp * 1, post = 1;
-
-            for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[c * num_points + p];
-            for (CeedInt d = 0; d < dim; d++) {
-              // ------ Tensor contract with current Chebyshev polynomial values
-              CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
-              CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode, p > 0 && d == (dim - 1), tmp[d % 2],
-                                               d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2]));
-              pre /= 1;
-              post *= Q_1d;
-            }
-          }
-          break;
-        }
-        case CEED_EVAL_GRAD: {
-          CeedScalar tmp[2][num_comp * CeedIntPow(Q_1d, dim)], chebyshev_x[Q_1d];
+/**
+  @brief Apply basis evaluation from nodes to arbitrary points and sum into target vector
 
-          // ---- Values at point
-          for (CeedInt p = 0; p < num_points; p++) {
-            // Dim**2 contractions, apply grad when pass == dim
-            for (CeedInt pass = 0; pass < dim; pass++) {
-              CeedInt pre = num_comp * 1, post = 1;
+  @param[in]  basis      `CeedBasis` to evaluate
+  @param[in]  num_elem   The number of elements to apply the basis evaluation to;
+                          the backend will specify the ordering in @ref CeedElemRestrictionCreate()
+  @param[in]  num_points Array of the number of points to apply the basis evaluation to in each element, size `num_elem`
+  @param[in]  t_mode     @ref CEED_NOTRANSPOSE to evaluate from nodes to points;
+                           @ref CEED_NOTRANSPOSE is not valid for `CeedBasisApplyAddAtPoints()`
+  @param[in]  eval_mode  @ref CEED_EVAL_INTERP to use interpolated values,
+                           @ref CEED_EVAL_GRAD to use gradients,
+                           @ref CEED_EVAL_WEIGHT to use quadrature weights
+  @param[in]  x_ref      `CeedVector` holding reference coordinates of each point
+  @param[in]  u          Input `CeedVector`, of length `num_nodes * num_comp` for @ref CEED_NOTRANSPOSE
+  @param[out] v          Output `CeedVector`, of length `num_points * num_q_comp` for @ref CEED_NOTRANSPOSE with @ref CEED_EVAL_INTERP
 
-              for (CeedInt c = 0; c < num_comp; c++) tmp[0][c] = u_array[(pass * num_comp + c) * num_points + p];
-              for (CeedInt d = 0; d < dim; d++) {
-                // ------ Tensor contract with current Chebyshev polynomial values
-                if (pass == d) CeedCall(CeedChebyshevDerivativeAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
-                else CeedCall(CeedChebyshevPolynomialsAtPoint(x_array_read[d * num_points + p], Q_1d, chebyshev_x));
-                CeedCall(CeedTensorContractApply(basis->contract, pre, 1, post, Q_1d, chebyshev_x, t_mode,
-                                                 (p > 0 || (p == 0 && pass > 0)) && d == (dim - 1), tmp[d % 2],
-                                                 d == (dim - 1) ? chebyshev_coeffs : tmp[(d + 1) % 2]));
-                pre /= 1;
-                post *= Q_1d;
-              }
-            }
-          }
-          break;
-        }
-        default:
-          // Nothing to do, excluded above
-          break;
-      }
-      CeedCall(CeedVectorRestoreArray(basis->vec_chebyshev, &chebyshev_coeffs));
-      CeedCall(CeedVectorRestoreArrayRead(x_ref, &x_array_read));
-      CeedCall(CeedVectorRestoreArrayRead(u, &u_array));
+  @return An error code: 0 - success, otherwise - failure
 
-      // -- Interpolate transpose from Chebyshev coefficients
-      CeedCall(CeedBasisApply(basis->basis_chebyshev, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, basis->vec_chebyshev, v));
-      break;
-    }
+  @ref User
+**/
+int CeedBasisApplyAddAtPoints(CeedBasis basis, CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                              CeedVector x_ref, CeedVector u, CeedVector v) {
+  CeedCheck(t_mode == CEED_TRANSPOSE, CeedBasisReturnCeed(basis), CEED_ERROR_UNSUPPORTED, "CeedBasisApplyAddAtPoints only supports CEED_TRANSPOSE");
+  CeedCall(CeedBasisApplyAtPointsCheckDims(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  if (basis->ApplyAddAtPoints) {
+    CeedCall(basis->ApplyAddAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
+  } else {
+    CeedCall(CeedBasisApplyAtPoints_Core(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1781,20 +2167,20 @@ int CeedBasisApplyAtPoints(CeedBasis basis, CeedInt num_points, CeedTransposeMod
   @ref Advanced
 **/
 int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed) {
-  *ceed = CeedBasisReturnCeed(basis);
+  CeedCall(CeedObjectGetCeed((CeedObject)basis, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
 /**
   @brief Return the `Ceed` associated with a `CeedBasis`
 
-  @param[in]  basis `CeedBasis`
+  @param[in] basis `CeedBasis`
 
   @return `Ceed` associated with the `basis`
 
   @ref Advanced
 **/
-Ceed CeedBasisReturnCeed(CeedBasis basis) { return basis->ceed; }
+Ceed CeedBasisReturnCeed(CeedBasis basis) { return CeedObjectReturnCeed((CeedObject)basis); }
 
 /**
   @brief Get dimension for given `CeedBasis`
@@ -2073,7 +2459,7 @@ int CeedBasisGetCurl(CeedBasis basis, const CeedScalar **curl) {
 }
 
 /**
-  @brief Destroy a @ref  CeedBasis
+  @brief Destroy a @ref CeedBasis
 
   @param[in,out] basis `CeedBasis` to destroy
 
@@ -2082,7 +2468,7 @@ int CeedBasisGetCurl(CeedBasis basis, const CeedScalar **curl) {
   @ref User
 **/
 int CeedBasisDestroy(CeedBasis *basis) {
-  if (!*basis || *basis == CEED_BASIS_NONE || --(*basis)->ref_count > 0) {
+  if (!*basis || *basis == CEED_BASIS_NONE || CeedObjectDereference((CeedObject)*basis) > 0) {
     *basis = NULL;
     return CEED_ERROR_SUCCESS;
   }
@@ -2098,7 +2484,7 @@ int CeedBasisDestroy(CeedBasis *basis) {
   CeedCall(CeedFree(&(*basis)->curl));
   CeedCall(CeedVectorDestroy(&(*basis)->vec_chebyshev));
   CeedCall(CeedBasisDestroy(&(*basis)->basis_chebyshev));
-  CeedCall(CeedDestroy(&(*basis)->ceed));
+  CeedCall(CeedObjectDestroy_Private(&(*basis)->obj));
   CeedCall(CeedFree(basis));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-config.c b/interface/ceed-config.c
new file mode 100644
index 0000000000..37ae708ec7
--- /dev/null
+++ b/interface/ceed-config.c
@@ -0,0 +1,90 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed-impl.h>
+
+const char *CeedGitVersion         = CEED_GIT_VERSION;
+const char *CeedBuildConfiguration = CEED_BUILD_CONFIGURATION;
+
+/// @addtogroup CeedUser
+/// @{
+
+/**
+  @brief Get output of `git describe --dirty` from build time.
+
+  While @ref CeedGetVersion() uniquely identifies the source code for release
+  builds, it does not identify builds from other commits.
+
+  @param[out] git_version A static string containing the Git commit description.
+
+  If `git describe --always --dirty` fails, the string `"unknown"` will be provided.
+  This could occur if Git is not installed or if libCEED is not being built from a repository, for example.`
+
+  @ref Developer
+
+  @sa CeedGetVersion() CeedGetBuildConfiguration()
+
+  @return An error code: 0 - success, otherwise - failure
+*/
+int CeedGetGitVersion(const char **git_version) {
+  *git_version = CeedGitVersion;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Set whether or not to use Clang when compiling for GPU (instead of nvrtc)
+
+  @param[in,out]  ceed     `Ceed` context to set Clang GPU compilation flag
+  @param[in]      is_clang Flag to use clang for GPU compilation
+
+  @ref Developer
+
+  @sa CeedGetIsClang()
+
+  @return An error code: 0 - success, otherwise - failure
+ */
+int CeedSetIsClang(Ceed ceed, bool is_clang) {
+  ceed->cuda_compile_with_clang = is_clang;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Determine if the current `ceed` is set to compile with Clang for CPU
+
+  @param[in]  ceed     `Ceed` context to get Clang GPU compilation flag
+  @param[out] is_clang Variable to store Clang GPU compilation flag
+
+  @ref Developer
+
+  @sa CeedSetIsClang()
+
+  @return An error code: 0 - success, otherwise - failure
+ */
+int CeedGetIsClang(Ceed ceed, bool *is_clang) {
+  *is_clang = ceed->cuda_compile_with_clang;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get build variables as a multi-line string.
+
+  Each line of the string has the format `VARNAME = value`.
+
+  @param[out] build_config A static string containing build variables
+
+  @ref Developer
+
+  @sa CeedGetVersion() CeedGetGitVersion()
+
+  @return An error code: 0 - success, otherwise - failure
+*/
+int CeedGetBuildConfiguration(const char **build_config) {
+  *build_config = CeedBuildConfiguration;
+  return CEED_ERROR_SUCCESS;
+}
+
+/// @}
diff --git a/interface/ceed-cuda.c b/interface/ceed-cuda.c
index c4463b738d..ea15d46735 100644
--- a/interface/ceed-cuda.c
+++ b/interface/ceed-cuda.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -23,10 +23,7 @@
 **/
 int CeedQFunctionSetCUDAUserFunction(CeedQFunction qf, CUfunction f) {
   if (!qf->SetCUDAUserFunction) {
-    Ceed ceed;
-
-    CeedCall(CeedQFunctionGetCeed(qf, &ceed));
-    CeedDebug(ceed, "Backend does not support CUfunction pointers for QFunctions.");
+    CeedDebug(CeedQFunctionReturnCeed(qf), "Backend does not support CUfunction pointers for QFunctions.");
   } else {
     CeedCall(qf->SetCUDAUserFunction(qf, f));
   }
diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c
index e687c0daed..476daab0c2 100644
--- a/interface/ceed-elemrestriction.c
+++ b/interface/ceed-elemrestriction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -98,6 +98,35 @@ int CeedPermutePadCurlOrients(const CeedInt8 *curl_orients, CeedInt8 *block_curl
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief View a `CeedElemRestriction` passed as a `CeedObject`
+
+  @param[in] rstr   `CeedElemRestriction` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedElemRestrictionView_Object(CeedObject rstr, FILE *stream) {
+  CeedCall(CeedElemRestrictionView((CeedElemRestriction)rstr, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Destroy a `CeedElemRestricton` passed as a `CeedObject`
+
+  @param[in,out] rstr Address of `CeedElemRestriction` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedElemRestrictionDestroy_Object(CeedObject *rstr) {
+  CeedCall(CeedElemRestrictionDestroy((CeedElemRestriction *)rstr));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -146,7 +175,7 @@ int CeedElemRestrictionIsStrided(CeedElemRestriction rstr, bool *is_strided) {
 
   @ref Backend
 **/
-int CeedElemRestrictionIsPoints(CeedElemRestriction rstr, bool *is_points) {
+int CeedElemRestrictionIsAtPoints(CeedElemRestriction rstr, bool *is_points) {
   *is_points = (rstr->rstr_type == CEED_RESTRICTION_POINTS);
   return CEED_ERROR_SUCCESS;
 }
@@ -164,13 +193,12 @@ int CeedElemRestrictionIsPoints(CeedElemRestriction rstr, bool *is_points) {
 **/
 int CeedElemRestrictionAtPointsAreCompatible(CeedElemRestriction rstr_a, CeedElemRestriction rstr_b, bool *are_compatible) {
   CeedInt num_elem_a, num_elem_b, num_points_a, num_points_b;
-  Ceed    ceed;
-
-  CeedCall(CeedElemRestrictionGetCeed(rstr_a, &ceed));
 
   // Cannot compare non-points restrictions
-  CeedCheck(rstr_a->rstr_type == CEED_RESTRICTION_POINTS, ceed, CEED_ERROR_UNSUPPORTED, "First CeedElemRestriction must be AtPoints");
-  CeedCheck(rstr_b->rstr_type == CEED_RESTRICTION_POINTS, ceed, CEED_ERROR_UNSUPPORTED, "Second CeedElemRestriction must be AtPoints");
+  CeedCheck(rstr_a->rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr_a), CEED_ERROR_UNSUPPORTED,
+            "First CeedElemRestriction must be AtPoints");
+  CeedCheck(rstr_b->rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr_a), CEED_ERROR_UNSUPPORTED,
+            "Second CeedElemRestriction must be AtPoints");
 
   CeedCall(CeedElemRestrictionGetNumElements(rstr_a, &num_elem_a));
   CeedCall(CeedElemRestrictionGetNumElements(rstr_b, &num_elem_b));
@@ -243,7 +271,7 @@ int CeedElemRestrictionGetOffsets(CeedElemRestriction rstr, CeedMemType mem_type
     CeedCall(CeedElemRestrictionGetOffsets(rstr->rstr_base, mem_type, offsets));
   } else {
     CeedCheck(rstr->GetOffsets, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED,
-              "Backend does not support CeedElemRestrictionGetOffsets");
+              "Backend does not implement CeedElemRestrictionGetOffsets");
     CeedCall(rstr->GetOffsets(rstr, mem_type, offsets));
     rstr->num_readers++;
   }
@@ -284,7 +312,7 @@ int CeedElemRestrictionRestoreOffsets(CeedElemRestriction rstr, const CeedInt **
 **/
 int CeedElemRestrictionGetOrientations(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) {
   CeedCheck(rstr->GetOrientations, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED,
-            "Backend does not support CeedElemRestrictionGetOrientations");
+            "Backend does not implement CeedElemRestrictionGetOrientations");
   CeedCall(rstr->GetOrientations(rstr, mem_type, orients));
   rstr->num_readers++;
   return CEED_ERROR_SUCCESS;
@@ -320,7 +348,7 @@ int CeedElemRestrictionRestoreOrientations(CeedElemRestriction rstr, const bool
 **/
 int CeedElemRestrictionGetCurlOrientations(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) {
   CeedCheck(rstr->GetCurlOrientations, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED,
-            "Backend does not support CeedElemRestrictionGetCurlOrientations");
+            "Backend does not implement CeedElemRestrictionGetCurlOrientations");
   CeedCall(rstr->GetCurlOrientations(rstr, mem_type, curl_orients));
   rstr->num_readers++;
   return CEED_ERROR_SUCCESS;
@@ -357,14 +385,13 @@ int CeedElemRestrictionRestoreCurlOrientations(CeedElemRestriction rstr, const C
 int CeedElemRestrictionGetLLayout(CeedElemRestriction rstr, CeedInt layout[3]) {
   bool                has_backend_strides;
   CeedRestrictionType rstr_type;
-  Ceed                ceed;
 
-  CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
   CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
-  CeedCheck(rstr_type == CEED_RESTRICTION_STRIDED, ceed, CEED_ERROR_MINOR, "Only strided CeedElemRestriction have strided L-vector layout");
+  CeedCheck(rstr_type == CEED_RESTRICTION_STRIDED, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_MINOR,
+            "Only strided CeedElemRestriction have strided L-vector layout");
   CeedCall(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
   if (has_backend_strides) {
-    CeedCheck(rstr->l_layout[0], ceed, CEED_ERROR_MINOR, "CeedElemRestriction has no L-vector layout data");
+    CeedCheck(rstr->l_layout[0], CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_MINOR, "CeedElemRestriction has no L-vector layout data");
     for (CeedInt i = 0; i < 3; i++) layout[i] = rstr->l_layout[i];
   } else {
     CeedCall(CeedElemRestrictionGetStrides(rstr, layout));
@@ -429,6 +456,70 @@ int CeedElemRestrictionSetELayout(CeedElemRestriction rstr, CeedInt layout[3]) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+
+  @brief Get the E-vector element offset of a `CeedElemRestriction` at points
+
+  @param[in]  rstr        `CeedElemRestriction`
+  @param[in]  elem        Element number index into E-vector for
+  @param[out] elem_offset Offset for element `elem` in the E-vector.
+                            The data for point `i`, component `j`, element `elem` in the E-vector is given by `i*e_layout[0] + j*e_layout[1] + elem_offset`.
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedElemRestrictionGetAtPointsElementOffset(CeedElemRestriction rstr, CeedInt elem, CeedSize *elem_offset) {
+  CeedInt             num_comp;
+  CeedRestrictionType rstr_type;
+
+  CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
+  CeedCheck(rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE,
+            "Can only compute offset for a points CeedElemRestriction");
+
+  // Backend method
+  if (rstr->GetAtPointsElementOffset) {
+    CeedCall(rstr->GetAtPointsElementOffset(rstr, elem, elem_offset));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  // Default layout (CPU)
+  *elem_offset = 0;
+  CeedCall(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
+  for (CeedInt i = 0; i < elem; i++) {
+    CeedInt num_points;
+
+    CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr, i, &num_points));
+    *elem_offset += num_points * num_comp;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+
+  @brief Set the E-vector size of a `CeedElemRestriction` at points
+
+  @param[in,out]  rstr   `CeedElemRestriction`
+  @param[in]      e_size New E-vector size; must be longer than the current E-vector size
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedElemRestrictionSetAtPointsEVectorSize(CeedElemRestriction rstr, CeedSize e_size) {
+  CeedRestrictionType rstr_type;
+
+  CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
+  CeedCheck(rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE,
+            "Can only compute offset for a points CeedElemRestriction");
+  CeedCheck(e_size >= rstr->e_size, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE,
+            "Can only increase the size of the E-vector for the CeedElemRestriction."
+            " Current size: %" CeedSize_FMT " New size: %" CeedSize_FMT,
+            rstr->e_size, e_size);
+  rstr->e_size = e_size;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get the backend data of a `CeedElemRestriction`
 
@@ -469,7 +560,7 @@ int CeedElemRestrictionSetData(CeedElemRestriction rstr, void *data) {
   @ref Backend
 **/
 int CeedElemRestrictionReference(CeedElemRestriction rstr) {
-  rstr->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)rstr));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -571,6 +662,7 @@ int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, CeedInt elem_size, Ce
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreate");
     CeedCall(CeedElemRestrictionCreate(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -580,8 +672,7 @@ int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, CeedInt elem_size, Ce
   CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1");
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -626,8 +717,9 @@ int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, CeedInt elem_
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateOriented");
-    CeedCall(
-        CeedElemRestrictionCreateOriented(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, orients, rstr));
+    CeedCall(CeedElemRestrictionCreateOriented(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, orients,
+                                               rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -637,8 +729,7 @@ int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, CeedInt elem_
   CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1");
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -686,6 +777,7 @@ int CeedElemRestrictionCreateCurlOriented(Ceed ceed, CeedInt num_elem, CeedInt e
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateCurlOriented");
     CeedCall(CeedElemRestrictionCreateCurlOriented(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets,
                                                    curl_orients, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -695,8 +787,7 @@ int CeedElemRestrictionCreateCurlOriented(Ceed ceed, CeedInt num_elem, CeedInt e
   CeedCheck(num_comp == 1 || comp_stride > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction component stride must be at least 1");
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -738,18 +829,19 @@ int CeedElemRestrictionCreateStrided(Ceed ceed, CeedInt num_elem, CeedInt elem_s
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateStrided");
     CeedCall(CeedElemRestrictionCreateStrided(delegate, num_elem, elem_size, num_comp, l_size, strides, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
   CeedCheck(num_elem >= 0, ceed, CEED_ERROR_DIMENSION, "Number of elements must be non-negative");
   CeedCheck(elem_size > 0, ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1");
   CeedCheck(num_comp > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction must have at least 1 component");
-  CeedCheck(l_size >= (CeedSize)num_elem * elem_size * num_comp, ceed, CEED_ERROR_DIMENSION,
-            "L-vector size must be at least num_elem * elem_size * num_comp");
+  CeedCheck(l_size >= (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, ceed, CEED_ERROR_DIMENSION,
+            "L-vector size must be at least num_elem * elem_size * num_comp. Expected: > %" CeedSize_FMT " Found: %" CeedSize_FMT,
+            (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, l_size);
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count  = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem   = num_elem;
   (*rstr)->elem_size  = elem_size;
   (*rstr)->num_comp   = num_comp;
@@ -804,25 +896,28 @@ int CeedElemRestrictionCreateAtPoints(Ceed ceed, CeedInt num_elem, CeedInt num_p
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateAtPoints");
     CeedCall(CeedElemRestrictionCreateAtPoints(delegate, num_elem, num_points, num_comp, l_size, mem_type, copy_mode, offsets, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
   CeedCheck(num_elem >= 0, ceed, CEED_ERROR_DIMENSION, "Number of elements must be non-negative");
   CeedCheck(num_points >= 0, ceed, CEED_ERROR_DIMENSION, "Number of points must be non-negative");
   CeedCheck(num_comp > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction must have at least 1 component");
-  CeedCheck(l_size >= (CeedSize)num_points * num_comp, ceed, CEED_ERROR_DIMENSION, "L-vector must be at least num_points * num_comp");
+  CeedCheck(l_size >= (CeedSize)num_points * num_comp, ceed, CEED_ERROR_DIMENSION,
+            "L-vector must be at least num_points * num_comp. Expected: > %" CeedSize_FMT " Found: %" CeedSize_FMT, (CeedSize)num_points * num_comp,
+            l_size);
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count  = 1;
-  (*rstr)->num_elem   = num_elem;
-  (*rstr)->num_points = num_points;
-  (*rstr)->num_comp   = num_comp;
-  (*rstr)->l_size     = l_size;
-  (*rstr)->e_size     = (CeedSize)num_points * (CeedSize)num_comp;
-  (*rstr)->num_block  = num_elem;
-  (*rstr)->block_size = 1;
-  (*rstr)->rstr_type  = CEED_RESTRICTION_POINTS;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
+  (*rstr)->num_elem    = num_elem;
+  (*rstr)->num_points  = num_points;
+  (*rstr)->num_comp    = num_comp;
+  (*rstr)->comp_stride = 1;
+  (*rstr)->l_size      = l_size;
+  (*rstr)->e_size      = (CeedSize)num_points * (CeedSize)num_comp;
+  (*rstr)->num_block   = num_elem;
+  (*rstr)->block_size  = 1;
+  (*rstr)->rstr_type   = CEED_RESTRICTION_POINTS;
   CeedCall(ceed->ElemRestrictionCreateAtPoints(mem_type, copy_mode, offsets, NULL, NULL, *rstr));
   return CEED_ERROR_SUCCESS;
 }
@@ -864,6 +959,7 @@ int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, CeedInt elem_s
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlocked");
     CeedCall(CeedElemRestrictionCreateBlocked(delegate, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets,
                                               rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -877,8 +973,7 @@ int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, CeedInt elem_s
   CeedCall(CeedPermutePadOffsets(offsets, block_offsets, num_block, num_elem, block_size, elem_size));
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -933,6 +1028,7 @@ int CeedElemRestrictionCreateBlockedOriented(Ceed ceed, CeedInt num_elem, CeedIn
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlockedOriented");
     CeedCall(CeedElemRestrictionCreateBlockedOriented(delegate, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, mem_type, copy_mode,
                                                       offsets, orients, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -947,8 +1043,7 @@ int CeedElemRestrictionCreateBlockedOriented(Ceed ceed, CeedInt num_elem, CeedIn
   CeedCall(CeedPermutePadOrients(orients, block_orients, num_block, num_elem, block_size, elem_size));
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -958,8 +1053,8 @@ int CeedElemRestrictionCreateBlockedOriented(Ceed ceed, CeedInt num_elem, CeedIn
   (*rstr)->num_block   = num_block;
   (*rstr)->block_size  = block_size;
   (*rstr)->rstr_type   = CEED_RESTRICTION_ORIENTED;
-  CeedCall(
-      ceed->ElemRestrictionCreateBlocked(CEED_MEM_HOST, CEED_OWN_POINTER, (const CeedInt *)block_offsets, (const bool *)block_orients, NULL, *rstr));
+  CeedCall(ceed->ElemRestrictionCreateBlocked(CEED_MEM_HOST, CEED_OWN_POINTER, (const CeedInt *)block_offsets, (const bool *)block_orients, NULL,
+                                              *rstr));
   if (copy_mode == CEED_OWN_POINTER) CeedCall(CeedFree(&offsets));
   return CEED_ERROR_SUCCESS;
 }
@@ -1005,6 +1100,7 @@ int CeedElemRestrictionCreateBlockedCurlOriented(Ceed ceed, CeedInt num_elem, Ce
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlockedCurlOriented");
     CeedCall(CeedElemRestrictionCreateBlockedCurlOriented(delegate, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, mem_type,
                                                           copy_mode, offsets, curl_orients, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1020,8 +1116,7 @@ int CeedElemRestrictionCreateBlockedCurlOriented(Ceed ceed, CeedInt num_elem, Ce
   CeedCall(CeedPermutePadCurlOrients(curl_orients, block_curl_orients, num_block, num_elem, block_size, 3 * elem_size));
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem    = num_elem;
   (*rstr)->elem_size   = elem_size;
   (*rstr)->num_comp    = num_comp;
@@ -1066,6 +1161,7 @@ int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, CeedInt
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"));
     CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionCreateBlockedStrided");
     CeedCall(CeedElemRestrictionCreateBlockedStrided(delegate, num_elem, elem_size, block_size, num_comp, l_size, strides, rstr));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -1073,12 +1169,12 @@ int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, CeedInt
   CeedCheck(elem_size > 0, ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1");
   CeedCheck(block_size > 0, ceed, CEED_ERROR_DIMENSION, "Block size must be at least 1");
   CeedCheck(num_comp > 0, ceed, CEED_ERROR_DIMENSION, "CeedElemRestriction must have at least 1 component");
-  CeedCheck(l_size >= (CeedSize)num_elem * elem_size * num_comp, ceed, CEED_ERROR_DIMENSION,
-            "L-vector size must be at least num_elem * elem_size * num_comp");
+  CeedCheck(l_size >= (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, ceed, CEED_ERROR_DIMENSION,
+            "L-vector size must be at least num_elem * elem_size * num_comp. Expected: > %" CeedSize_FMT " Found: %" CeedSize_FMT,
+            (CeedSize)num_elem * (CeedSize)elem_size * (CeedSize)num_comp, l_size);
 
   CeedCall(CeedCalloc(1, rstr));
-  CeedCall(CeedReferenceCopy(ceed, &(*rstr)->ceed));
-  (*rstr)->ref_count  = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object, &(*rstr)->obj));
   (*rstr)->num_elem   = num_elem;
   (*rstr)->elem_size  = elem_size;
   (*rstr)->num_comp   = num_comp;
@@ -1110,10 +1206,9 @@ int CeedElemRestrictionCreateUnsignedCopy(CeedElemRestriction rstr, CeedElemRest
 
   // Copy old rstr
   memcpy(*rstr_unsigned, rstr, sizeof(struct CeedElemRestriction_private));
-  (*rstr_unsigned)->ceed = NULL;
-  CeedCall(CeedReferenceCopy(rstr->ceed, &(*rstr_unsigned)->ceed));
-  (*rstr_unsigned)->ref_count = 1;
-  (*rstr_unsigned)->strides   = NULL;
+  CeedCall(CeedObjectCreate(CeedElemRestrictionReturnCeed(rstr), CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object,
+                            &(*rstr_unsigned)->obj));
+  (*rstr_unsigned)->strides = NULL;
   if (rstr->strides) {
     CeedCall(CeedMalloc(3, &(*rstr_unsigned)->strides));
     for (CeedInt i = 0; i < 3; i++) (*rstr_unsigned)->strides[i] = rstr->strides[i];
@@ -1142,10 +1237,9 @@ int CeedElemRestrictionCreateUnorientedCopy(CeedElemRestriction rstr, CeedElemRe
 
   // Copy old rstr
   memcpy(*rstr_unoriented, rstr, sizeof(struct CeedElemRestriction_private));
-  (*rstr_unoriented)->ceed = NULL;
-  CeedCall(CeedReferenceCopy(rstr->ceed, &(*rstr_unoriented)->ceed));
-  (*rstr_unoriented)->ref_count = 1;
-  (*rstr_unoriented)->strides   = NULL;
+  CeedCall(CeedObjectCreate(CeedElemRestrictionReturnCeed(rstr), CeedElemRestrictionView_Object, CeedElemRestrictionDestroy_Object,
+                            &(*rstr_unoriented)->obj));
+  (*rstr_unoriented)->strides = NULL;
   if (rstr->strides) {
     CeedCall(CeedMalloc(3, &(*rstr_unoriented)->strides));
     for (CeedInt i = 0; i < 3; i++) (*rstr_unoriented)->strides[i] = rstr->strides[i];
@@ -1199,6 +1293,7 @@ int CeedElemRestrictionCreateVector(CeedElemRestriction rstr, CeedVector *l_vec,
   CeedCall(CeedElemRestrictionGetEVectorSize(rstr, &e_size));
   if (l_vec) CeedCall(CeedVectorCreate(ceed, l_size, l_vec));
   if (e_vec) CeedCall(CeedVectorCreate(ceed, e_size, e_vec));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1219,9 +1314,7 @@ int CeedElemRestrictionCreateVector(CeedElemRestriction rstr, CeedVector *l_vec,
 int CeedElemRestrictionApply(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector ru, CeedRequest *request) {
   CeedSize min_u_len, min_ru_len, len;
   CeedInt  num_elem;
-  Ceed     ceed;
 
-  CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
   if (t_mode == CEED_NOTRANSPOSE) {
     CeedCall(CeedElemRestrictionGetEVectorSize(rstr, &min_ru_len));
     CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &min_u_len));
@@ -1230,11 +1323,11 @@ int CeedElemRestrictionApply(CeedElemRestriction rstr, CeedTransposeMode t_mode,
     CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &min_ru_len));
   }
   CeedCall(CeedVectorGetLength(u, &len));
-  CeedCheck(min_u_len <= len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_u_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Input vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_ru_len,
             min_u_len);
   CeedCall(CeedVectorGetLength(ru, &len));
-  CeedCheck(min_ru_len <= len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_ru_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Output vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_u_len,
             min_ru_len);
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
@@ -1261,9 +1354,10 @@ int CeedElemRestrictionApplyAtPointsInElement(CeedElemRestriction rstr, CeedInt
                                               CeedRequest *request) {
   CeedSize min_u_len, min_ru_len, len;
   CeedInt  num_elem;
-  Ceed     ceed;
 
-  CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
+  CeedCheck(rstr->ApplyAtPointsInElement, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED,
+            "Backend does not implement CeedElemRestrictionApplyAtPointsInElement");
+
   if (t_mode == CEED_NOTRANSPOSE) {
     CeedInt num_points, num_comp;
 
@@ -1280,17 +1374,17 @@ int CeedElemRestrictionApplyAtPointsInElement(CeedElemRestriction rstr, CeedInt
     CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &min_ru_len));
   }
   CeedCall(CeedVectorGetLength(u, &len));
-  CeedCheck(min_u_len <= len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_u_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Input vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT
             ") for element %" CeedInt_FMT,
             len, min_ru_len, min_u_len, elem);
   CeedCall(CeedVectorGetLength(ru, &len));
-  CeedCheck(min_ru_len <= len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_ru_len <= len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Output vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT
             ") for element %" CeedInt_FMT,
             len, min_ru_len, min_u_len, elem);
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCheck(elem < num_elem, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(elem < num_elem, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Cannot retrieve element %" CeedInt_FMT ", element %" CeedInt_FMT " > total elements %" CeedInt_FMT "", elem, elem, num_elem);
   if (num_elem > 0) CeedCall(rstr->ApplyAtPointsInElement(rstr, elem, t_mode, u, ru, request));
   return CEED_ERROR_SUCCESS;
@@ -1315,10 +1409,9 @@ int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedT
                                   CeedRequest *request) {
   CeedSize min_u_len, min_ru_len, len;
   CeedInt  block_size, num_elem;
-  Ceed     ceed;
 
-  CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed));
-  CeedCheck(rstr->ApplyBlock, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedElemRestrictionApplyBlock");
+  CeedCheck(rstr->ApplyBlock, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_UNSUPPORTED,
+            "Backend does not implement CeedElemRestrictionApplyBlock");
 
   CeedCall(CeedElemRestrictionGetBlockSize(rstr, &block_size));
   if (t_mode == CEED_NOTRANSPOSE) {
@@ -1337,15 +1430,15 @@ int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedT
     min_u_len = (CeedSize)block_size * (CeedSize)elem_size * (CeedSize)num_comp;
   }
   CeedCall(CeedVectorGetLength(u, &len));
-  CeedCheck(min_u_len == len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_u_len == len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Input vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_u_len,
             min_ru_len);
   CeedCall(CeedVectorGetLength(ru, &len));
-  CeedCheck(min_ru_len == len, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(min_ru_len == len, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Output vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", len, min_ru_len,
             min_u_len);
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCheck(block_size * block <= num_elem, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(block_size * block <= num_elem, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_DIMENSION,
             "Cannot retrieve block %" CeedInt_FMT ", element %" CeedInt_FMT " > total elements %" CeedInt_FMT "", block, block_size * block,
             num_elem);
   CeedCall(rstr->ApplyBlock(rstr, block, t_mode, u, ru, request));
@@ -1363,7 +1456,7 @@ int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedT
   @ref Advanced
 **/
 int CeedElemRestrictionGetCeed(CeedElemRestriction rstr, Ceed *ceed) {
-  *ceed = CeedElemRestrictionReturnCeed(rstr);
+  CeedCall(CeedObjectGetCeed((CeedObject)rstr, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1376,7 +1469,7 @@ int CeedElemRestrictionGetCeed(CeedElemRestriction rstr, Ceed *ceed) {
 
   @ref Advanced
 **/
-Ceed CeedElemRestrictionReturnCeed(CeedElemRestriction rstr) { return rstr->ceed; }
+Ceed CeedElemRestrictionReturnCeed(CeedElemRestriction rstr) { return CeedObjectReturnCeed((CeedObject)rstr); }
 
 /**
   @brief Get the L-vector component stride
@@ -1425,10 +1518,10 @@ int CeedElemRestrictionGetElementSize(CeedElemRestriction rstr, CeedInt *elem_si
 
 /**
 
-  @brief Get the number of points in the l-vector for a points `CeedElemRestriction`
+  @brief Get the number of points in the offsets array for a points `CeedElemRestriction`
 
   @param[in]  rstr       `CeedElemRestriction`
-  @param[out] num_points The number of points in the l-vector
+  @param[out] num_points The number of points in the offsets array
 
   @return An error code: 0 - success, otherwise - failure
 
@@ -1472,34 +1565,77 @@ int CeedElemRestrictionGetNumPointsInElement(CeedElemRestriction rstr, CeedInt e
 }
 
 /**
-  @brief Get the maximum number of points in an element for a `CeedElemRestriction` at points
+  @brief Get the minimum and/or maximum number of points in an element for a `CeedElemRestriction` at points
 
   @param[in]  rstr       `CeedElemRestriction`
-  @param[out] max_points Variable to store size of elements
+  @param[out] min_points Variable to minimum number of points in an element, or `NULL`
+  @param[out] max_points Variable to maximum number of points in an element, or `NULL`
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref Advanced
 **/
-int CeedElemRestrictionGetMaxPointsInElement(CeedElemRestriction rstr, CeedInt *max_points) {
-  CeedInt             num_elem;
+int CeedElemRestrictionGetMinMaxPointsInElement(CeedElemRestriction rstr, CeedInt *min_points, CeedInt *max_points) {
+  CeedInt             num_elem, num_points;
   CeedRestrictionType rstr_type;
 
   CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
   CeedCheck(rstr_type == CEED_RESTRICTION_POINTS, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_INCOMPATIBLE,
-            "Cannot compute max points for a CeedElemRestriction that does not use points");
+            "Cannot compute min/max points for a CeedElemRestriction that does not use points");
 
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  *max_points = 0;
-  for (CeedInt e = 0; e < num_elem; e++) {
-    CeedInt num_points;
 
+  // Exit early if there are no elements
+  if (num_elem == 0) {
+    if (min_points) *min_points = 0;
+    if (max_points) *max_points = 0;
+    return CEED_ERROR_SUCCESS;
+  }
+
+  // Initialize to the number of points in the first element
+  CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr, 0, &num_points));
+  if (min_points) *min_points = num_points;
+  if (max_points) *max_points = num_points;
+  for (CeedInt e = 1; e < num_elem; e++) {
     CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr, e, &num_points));
-    *max_points = CeedIntMax(num_points, *max_points);
+    if (min_points) *min_points = CeedIntMin(num_points, *min_points);
+    if (max_points) *max_points = CeedIntMax(num_points, *max_points);
   }
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the maximum number of points in an element for a `CeedElemRestriction` at points
+
+  @param[in]  rstr       `CeedElemRestriction`
+  @param[out] max_points Variable to store maximum number of points in an element
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+
+  @see CeedElemRestrictionGetMinMaxPointsInElement()
+**/
+int CeedElemRestrictionGetMaxPointsInElement(CeedElemRestriction rstr, CeedInt *max_points) {
+  return CeedElemRestrictionGetMinMaxPointsInElement(rstr, NULL, max_points);
+}
+
+/**
+  @brief Get the minimum number of points in an element for a `CeedElemRestriction` at points
+
+  @param[in]  rstr       `CeedElemRestriction`
+  @param[out] min_points Variable to store minimum number of points in an element
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+
+  @see CeedElemRestrictionGetMinMaxPointsInElement()
+**/
+int CeedElemRestrictionGetMinPointsInElement(CeedElemRestriction rstr, CeedInt *min_points) {
+  return CeedElemRestrictionGetMinMaxPointsInElement(rstr, min_points, NULL);
+}
+
 /**
   @brief Get the size of the l-vector for a `CeedElemRestriction`
 
@@ -1602,6 +1738,36 @@ int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedElemRestrictionView() output
+
+  @param[in] rstr     `CeedElemRestriction` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedElemRestrictionSetNumViewTabs(CeedElemRestriction rstr, CeedInt num_tabs) {
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)rstr, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get the number of tabs to indent for @ref CeedElemRestrictionView() output
+
+  @param[in]  rstr     `CeedElemRestriction` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedElemRestrictionGetNumViewTabs(CeedElemRestriction rstr, CeedInt *num_tabs) {
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)rstr, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedElemRestriction`
 
@@ -1613,17 +1779,26 @@ int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult
   @ref User
 **/
 int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream) {
+  char               *tabs = NULL;
   CeedRestrictionType rstr_type;
 
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedElemRestrictionGetNumViewTabs(rstr, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
+
   CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
   if (rstr_type == CEED_RESTRICTION_POINTS) {
     CeedInt max_points;
 
     CeedCall(CeedElemRestrictionGetMaxPointsInElement(rstr, &max_points));
     fprintf(stream,
-            "CeedElemRestriction at points from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with a maximum of %" CeedInt_FMT
+            "%sCeedElemRestriction at points from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with a maximum of %" CeedInt_FMT
             " points on an element\n",
-            rstr->l_size, rstr->num_comp, rstr->num_elem, max_points);
+            tabs, rstr->l_size, rstr->num_comp, rstr->num_elem, max_points);
   } else {
     char strides_str[500];
 
@@ -1633,11 +1808,12 @@ int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream) {
       sprintf(strides_str, "%" CeedInt_FMT, rstr->comp_stride);
     }
     fprintf(stream,
-            "%sCeedElemRestriction from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with %" CeedInt_FMT
+            "%s%sCeedElemRestriction from (%" CeedSize_FMT ", %" CeedInt_FMT ") to %" CeedInt_FMT " elements with %" CeedInt_FMT
             " nodes each and %s %s\n",
-            rstr->block_size > 1 ? "Blocked " : "", rstr->l_size, rstr->num_comp, rstr->num_elem, rstr->elem_size,
+            tabs, rstr->block_size > 1 ? "Blocked " : "", rstr->l_size, rstr->num_comp, rstr->num_elem, rstr->elem_size,
             rstr->strides ? "strides" : "component stride", strides_str);
   }
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1651,11 +1827,11 @@ int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream) {
   @ref User
 **/
 int CeedElemRestrictionDestroy(CeedElemRestriction *rstr) {
-  if (!*rstr || *rstr == CEED_ELEMRESTRICTION_NONE || --(*rstr)->ref_count > 0) {
+  if (!*rstr || *rstr == CEED_ELEMRESTRICTION_NONE || CeedObjectDereference((CeedObject)*rstr) > 0) {
     *rstr = NULL;
     return CEED_ERROR_SUCCESS;
   }
-  CeedCheck((*rstr)->num_readers == 0, (*rstr)->ceed, CEED_ERROR_ACCESS,
+  CeedCheck((*rstr)->num_readers == 0, CeedElemRestrictionReturnCeed(*rstr), CEED_ERROR_ACCESS,
             "Cannot destroy CeedElemRestriction, a process has read access to the offset data");
 
   // Only destroy backend data once between rstr and unsigned copy
@@ -1663,7 +1839,7 @@ int CeedElemRestrictionDestroy(CeedElemRestriction *rstr) {
   else if ((*rstr)->Destroy) CeedCall((*rstr)->Destroy(*rstr));
 
   CeedCall(CeedFree(&(*rstr)->strides));
-  CeedCall(CeedDestroy(&(*rstr)->ceed));
+  CeedCall(CeedObjectDestroy_Private(&(*rstr)->obj));
   CeedCall(CeedFree(rstr));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-fortran.c b/interface/ceed-fortran.c
index d3ed061c3e..042d7ae014 100644
--- a/interface/ceed-fortran.c
+++ b/interface/ceed-fortran.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -85,6 +85,9 @@ CEED_EXTERN void fCeedIsDeterministic(int *ceed, int *is_deterministic, int *err
 #define fCeedGetPreferredMemType FORTRAN_NAME(ceedgetpreferredmemtype, CEEDGETPREFERREDMEMTYPE)
 CEED_EXTERN void fCeedGetPreferredMemType(int *ceed, int *type, int *err) { *err = CeedGetPreferredMemType(Ceed_dict[*ceed], (CeedMemType *)type); }
 
+#define fCeedSetNumViewTabs FORTRAN_NAME(ceedsetnumviewtabs, CEEDSETNUMVIEWTABS)
+CEED_EXTERN void fCeedSetNumViewTabs(int *ceed, int *num_tabs, int *err) { *err = CeedSetNumViewTabs(Ceed_dict[*ceed], *num_tabs); }
+
 #define fCeedView FORTRAN_NAME(ceedview, CEEDVIEW)
 CEED_EXTERN void fCeedView(int *ceed, int *err) { *err = CeedView(Ceed_dict[*ceed], stdout); }
 
@@ -192,6 +195,9 @@ CEED_EXTERN void fCeedVectorNorm(int *vec, int *type, CeedScalar *norm, int *err
 #define fCeedVectorReciprocal FORTRAN_NAME(ceedvectorreciprocal, CEEDVECTORRECIPROCAL)
 CEED_EXTERN void fCeedVectorReciprocal(int *vec, int *err) { *err = CeedVectorReciprocal(CeedVector_dict[*vec]); }
 
+#define fCeedVectorSetNumViewTabs FORTRAN_NAME(ceedvectorsetnumviewtabs, CEEDVECTORSETNUMVIEWTABS)
+CEED_EXTERN void fCeedVectorSetNumViewTabs(int *vec, int *num_tabs, int *err) { *err = CeedVectorSetNumViewTabs(CeedVector_dict[*vec], *num_tabs); }
+
 #define fCeedVectorView FORTRAN_NAME(ceedvectorview, CEEDVECTORVIEW)
 CEED_EXTERN void fCeedVectorView(int *vec, int *err) { *err = CeedVectorView(CeedVector_dict[*vec], "%12.8f", stdout); }
 
@@ -449,6 +455,11 @@ CEED_EXTERN void fCeedElemRestrictionGetELayout(int *elemr, int *layout, int *er
   for (int i = 0; i < 3; i++) layout[i] = layout_c[i];
 }
 
+#define fCeedElemRestrictionSetNumViewTabs FORTRAN_NAME(ceedelemrestrictionsetnumviewtabs, CEEDELEMRESTRICTIONSETNUMVIEWTABS)
+CEED_EXTERN void fCeedElemRestrictionSetNumViewTabs(int *elemr, int *num_tabs, int *err) {
+  *err = CeedElemRestrictionSetNumViewTabs(CeedElemRestriction_dict[*elemr], *num_tabs);
+}
+
 #define fCeedElemRestrictionView FORTRAN_NAME(ceedelemrestrictionview, CEEDELEMRESTRICTIONVIEW)
 CEED_EXTERN void fCeedElemRestrictionView(int *elemr, int *err) { *err = CeedElemRestrictionView(CeedElemRestriction_dict[*elemr], stdout); }
 
@@ -575,6 +586,9 @@ CEED_EXTERN void fCeedBasisCreateHcurl(int *ceed, int *topo, int *num_comp, int
   }
 }
 
+#define fCeedBasisSetNumViewTabs FORTRAN_NAME(ceedbasissetnumviewtabs, CEEDBASISSETNUMVIEWTABS)
+CEED_EXTERN void fCeedBasisSetNumViewTabs(int *basis, int *num_tabs, int *err) { *err = CeedBasisSetNumViewTabs(CeedBasis_dict[*basis], *num_tabs); }
+
 #define fCeedBasisView FORTRAN_NAME(ceedbasisview, CEEDBASISVIEW)
 CEED_EXTERN void fCeedBasisView(int *basis, int *err) { *err = CeedBasisView(CeedBasis_dict[*basis], stdout); }
 
@@ -688,6 +702,11 @@ CEED_EXTERN void fCeedQFunctionContextRestoreData(int *ctx, CeedScalar *data, in
   *offset = 0;
 }
 
+#define fCeedQFunctionContextSetNumViewTabs FORTRAN_NAME(ceedqfunctioncontextsetnumviewtabs, CEEDQFUNCTIONCONTEXTSETNUMVIEWTABS)
+CEED_EXTERN void fCeedQFunctionContextSetNumViewTabs(int *ctx, int *num_tabs, int *err) {
+  *err = CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext_dict[*ctx], *num_tabs);
+}
+
 #define fCeedQFunctionContextView FORTRAN_NAME(ceedqfunctioncontextview, CEEDQFUNCTIONCONTEXTVIEW)
 CEED_EXTERN void fCeedQFunctionContextView(int *ctx, int *err) { *err = CeedQFunctionContextView(CeedQFunctionContext_dict[*ctx], stdout); }
 
@@ -845,6 +864,13 @@ CEED_EXTERN void fCeedQFunctionSetContext(int *qf, int *ctx, int *err) {
   if (*err) return;
   fctxdata->inner_ctx = ctx_;
   *err                = CeedQFunctionContextRestoreData(fctx, (void **)&fctxdata);
+  if (*err) return;
+  *err = CeedQFunctionContextDestroy(&fctx);
+}
+
+#define fCeedQFunctionSetNumViewTabs FORTRAN_NAME(ceedqfunctionsetnumviewtabs, CEEDQFUNCTIONSETNUMVIEWTABS)
+CEED_EXTERN void fCeedQFunctionSetNumViewTabs(int *qf, int *num_tabs, int *err) {
+  *err = CeedQFunctionSetNumViewTabs(CeedQFunction_dict[*qf], *num_tabs);
 }
 
 #define fCeedQFunctionView FORTRAN_NAME(ceedqfunctionview, CEEDQFUNCTIONVIEW)
@@ -949,8 +975,8 @@ CEED_EXTERN void fCeedOperatorCreate(int *ceed, int *qf, int *dqf, int *dqfT, in
   CeedOperator_n++;
 }
 
-#define fCeedCompositeOperatorCreate FORTRAN_NAME(ceedcompositeoperatorcreate, CEEDCOMPOSITEOPERATORCREATE)
-CEED_EXTERN void fCeedCompositeOperatorCreate(int *ceed, int *op, int *err) {
+#define fCeedOperatorCreateComposite FORTRAN_NAME(ceedoperatorcreatecomposite, CEEDOPERATORCREATECOMPOSITE)
+CEED_EXTERN void fCeedOperatorCreateComposite(int *ceed, int *op, int *err) {
   if (CeedOperator_count == CeedOperator_count_max) {
     CeedOperator_count_max += CeedOperator_count_max / 2 + 1;
     CeedRealloc(CeedOperator_count_max, &CeedOperator_dict);
@@ -958,7 +984,7 @@ CEED_EXTERN void fCeedCompositeOperatorCreate(int *ceed, int *op, int *err) {
 
   CeedOperator *op_ = &CeedOperator_dict[CeedOperator_count];
 
-  *err = CeedCompositeOperatorCreate(Ceed_dict[*ceed], op_);
+  *err = CeedOperatorCreateComposite(Ceed_dict[*ceed], op_);
   if (*err) return;
   *op = CeedOperator_count++;
   CeedOperator_n++;
@@ -1001,12 +1027,12 @@ CEED_EXTERN void fCeedOperatorSetField(int *op, const char *field_name, int *r,
   *err = CeedOperatorSetField(op_, field_name_c, r_, b_, v_);
 }
 
-#define fCeedCompositeOperatorAddSub FORTRAN_NAME(ceedcompositeoperatoraddsub, CEEDCOMPOSITEOPERATORADDSUB)
-CEED_EXTERN void fCeedCompositeOperatorAddSub(int *compositeop, int *subop, int *err) {
+#define fCeedOperatorCompositeAddSub FORTRAN_NAME(ceedoperatorcompositeaddsub, CEEDOPERATORCOMPOSITEADDSUB)
+CEED_EXTERN void fCeedOperatorCompositeAddSub(int *compositeop, int *subop, int *err) {
   CeedOperator compositeop_ = CeedOperator_dict[*compositeop];
   CeedOperator subop_       = CeedOperator_dict[*subop];
 
-  *err = CeedCompositeOperatorAddSub(compositeop_, subop_);
+  *err = CeedOperatorCompositeAddSub(compositeop_, subop_);
 }
 
 #define fCeedOperatorSetName FORTRAN_NAME(ceedoperatorsetname, CEEDOPERATORSETNAME)
@@ -1017,6 +1043,13 @@ CEED_EXTERN void fCeedOperatorSetName(int *op, const char *name, int *err, fortr
   *err = CeedOperatorSetName(op_, name_c);
 }
 
+#define fCeedOperatorSetNumViewTabs FORTRAN_NAME(ceedoperatorsetnumviewtabs, CEEDOPERATORSETNUMVIEWTABS)
+CEED_EXTERN void fCeedOperatorSetNumViewTabs(int *op, int *ntabs, int *err) {
+  CeedOperator op_ = CeedOperator_dict[*op];
+
+  *err = CeedOperatorSetNumViewTabs(op_, *ntabs);
+}
+
 #define fCeedOperatorLinearAssembleQFunction FORTRAN_NAME(ceedoperatorlinearassembleqfunction, CEEDOPERATORLINEARASSEMBLEQFUNCTION)
 CEED_EXTERN void fCeedOperatorLinearAssembleQFunction(int *op, int *assembledvec, int *assembledrstr, int *rqst, int *err) {
   // Vector
diff --git a/interface/ceed-hip.c b/interface/ceed-hip.c
index cc4a625853..f35480e873 100644
--- a/interface/ceed-hip.c
+++ b/interface/ceed-hip.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -23,10 +23,7 @@
 **/
 int CeedQFunctionSetHIPUserFunction(CeedQFunction qf, hipFunction_t f) {
   if (!qf->SetHIPUserFunction) {
-    Ceed ceed;
-
-    CeedCall(CeedQFunctionGetCeed(qf, &ceed));
-    CeedDebug(ceed, "Backend does not support hipFunction_t pointers for QFunctions.");
+    CeedDebug(CeedQFunctionReturnCeed(qf), "Backend does not support hipFunction_t pointers for QFunctions.");
   } else {
     CeedCall(qf->SetHIPUserFunction(qf, f));
   }
diff --git a/interface/ceed-jit-source-root-default.c b/interface/ceed-jit-source-root-default.c
index 6f1bc47e6c..2cee49718f 100644
--- a/interface/ceed-jit-source-root-default.c
+++ b/interface/ceed-jit-source-root-default.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-jit-source-root-install.c b/interface/ceed-jit-source-root-install.c
index ffa78b21d5..b80dca4f9f 100644
--- a/interface/ceed-jit-source-root-install.c
+++ b/interface/ceed-jit-source-root-install.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-jit-tools.c b/interface/ceed-jit-tools.c
index 4d4bf44e51..c50e683f9a 100644
--- a/interface/ceed-jit-tools.c
+++ b/interface/ceed-jit-tools.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -97,7 +97,7 @@ static int CeedNormalizePath(Ceed ceed, const char *source_file_path, char **nor
 
         while (last_slash[0] != '/' && last_slash != *normalized_source_file_path) last_slash--;
         CeedCheck(last_slash != *normalized_source_file_path, ceed, CEED_ERROR_MAJOR, "Malformed source path %s", source_file_path);
-        for (CeedInt i = 0; first_dot[i - 1]; i++) last_slash[i] = first_dot[i + 2];
+        for (CeedInt i = 0; first_dot[i + 1]; i++) last_slash[i] = first_dot[i + 2];
         search_from = last_slash;
       }
     }
@@ -130,8 +130,6 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- Ceed JiT ----------\n");
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Current source file: ");
   CeedDebug(ceed, "%s\n", source_file_path);
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Current buffer:\n");
-  CeedDebug(ceed, "%s\n", *buffer);
 
   // Read file to temporary buffer
   source_file = fopen(source_file_path, "rb");
@@ -139,9 +137,15 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C
   // -- Compute size of source
   fseek(source_file, 0L, SEEK_END);
   file_size = ftell(source_file);
-  rewind(source_file);
+  fseek(source_file, 0L, SEEK_SET);
   //  -- Allocate memory for entire source file
-  CeedCall(CeedCalloc(file_size + 1, &temp_buffer));
+  {
+    const int ierr = CeedCalloc(file_size + 1, &temp_buffer);
+
+    // Close stream before error handling, if necessary
+    if (ierr != CEED_ERROR_SUCCESS) fclose(source_file);
+    CeedCall(ierr);
+  }
   // -- Copy the file into the buffer
   if (1 != fread(temp_buffer, file_size, 1, source_file)) {
     // LCOV_EXCL_START
@@ -216,6 +220,9 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C
       bool  is_ceed_header    = next_left_chevron && (next_new_line - next_left_chevron > 0) &&
                             (!strncmp(next_left_chevron, "<ceed/jit-source/", 17) || !strncmp(next_left_chevron, "<ceed/types.h>", 14) ||
                              !strncmp(next_left_chevron, "<ceed/ceed-f32.h>", 17) || !strncmp(next_left_chevron, "<ceed/ceed-f64.h>", 17));
+      bool is_std_header =
+          next_left_chevron && (next_new_line - next_left_chevron > 0) &&
+          (!strncmp(next_left_chevron, "<std", 4) || !strncmp(next_left_chevron, "<math.h>", 8) || !strncmp(next_left_chevron, "<ceed", 5));
 
       if (is_local_header || is_ceed_header) {
         // ---- Build source path
@@ -254,6 +261,13 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, C
         }
         CeedCall(CeedFree(&include_source_path));
         CeedCall(CeedFree(&normalized_include_source_path));
+      } else if (!is_std_header) {
+        long header_copy_size = next_new_line - first_hash + 1;
+
+        CeedCall(CeedRealloc(current_size + copy_size + header_copy_size + 2, buffer));
+        memcpy(&(*buffer)[current_size + copy_size], "\n", 2);
+        memcpy(&(*buffer)[current_size + copy_size + 1], first_hash, header_copy_size);
+        memcpy(&(*buffer)[current_size + copy_size + header_copy_size], "", 1);
       }
       file_offset = strchr(first_hash, '\n') - temp_buffer + 1;
     }
@@ -390,28 +404,33 @@ int CeedGetJitRelativePath(const char *absolute_file_path, const char **relative
   @ref Backend
 **/
 int CeedGetJitAbsolutePath(Ceed ceed, const char *relative_file_path, const char **absolute_file_path) {
-  Ceed ceed_parent;
+  const char **jit_source_dirs;
+  CeedInt      num_source_dirs;
 
   // Debug
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- Ceed JiT ----------\n");
   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Relative JiT source file: ");
   CeedDebug(ceed, "%s\n", relative_file_path);
 
-  CeedCall(CeedGetParent(ceed, &ceed_parent));
-  for (CeedInt i = 0; i < ceed_parent->num_jit_source_roots; i++) {
+  CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_source_dirs, &jit_source_dirs));
+  for (CeedInt i = 0; i < num_source_dirs; i++) {
     bool is_valid;
 
     // Debug
     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Checking JiT root: ");
-    CeedDebug(ceed, "%s\n", ceed_parent->jit_source_roots[i]);
+    CeedDebug(ceed, "%s\n", jit_source_dirs[i]);
 
     // Build and check absolute path with current root
-    CeedCall(CeedPathConcatenate(ceed, ceed_parent->jit_source_roots[i], relative_file_path, (char **)absolute_file_path));
+    CeedCall(CeedPathConcatenate(ceed, jit_source_dirs[i], relative_file_path, (char **)absolute_file_path));
     CeedCall(CeedCheckFilePath(ceed, *absolute_file_path, &is_valid));
 
-    if (is_valid) return CEED_ERROR_SUCCESS;
+    if (is_valid) {
+      CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
+      return CEED_ERROR_SUCCESS;
+    }
     // LCOV_EXCL_START
-    else CeedCall(CeedFree(absolute_file_path));
+    else
+      CeedCall(CeedFree(absolute_file_path));
     // LCOV_EXCL_STOP
   }
   // LCOV_EXCL_START
diff --git a/interface/ceed-object.c b/interface/ceed-object.c
new file mode 100644
index 0000000000..24b38dbb31
--- /dev/null
+++ b/interface/ceed-object.c
@@ -0,0 +1,185 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed-impl.h>
+#include <ceed.h>
+#include <ceed/backend.h>
+
+/// @file
+/// Implementation of CeedObject functionality
+
+/// ----------------------------------------------------------------------------
+/// CeedObject Backend API
+/// ----------------------------------------------------------------------------
+/// @addtogroup CeedBackend
+/// @{
+
+/**
+  @brief Create a `CeedObject`.
+
+  Note: This interface takes a `CeedObject` and not a pointer to a `CeedObject` like other `Ceed*Create` interfaces.
+          This `CeedObject` will have already been allocated a the first part of the `Ceed*` struct.
+          This function is only intended to be called inside of `Ceed*Create` functions.
+
+  @param[in]  ceed             `Ceed` object to reference
+  @param[in]  view_function    `Ceed*` function for viewing the `obj`
+  @param[in]  destroy_function `Ceed*` function for destroying the `obj`
+  @param[out] obj              Address of the variable where is `CeedObject` exists
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedObjectCreate(Ceed ceed, int (*view_function)(CeedObject, FILE *), int (*destroy_function)(CeedObject *), CeedObject obj) {
+  obj->ceed = NULL;
+  if (ceed) CeedCall(CeedReferenceCopy(ceed, &obj->ceed));
+  obj->View = view_function;
+  CeedCheck(destroy_function, CeedObjectReturnCeed(obj), CEED_ERROR_UNSUPPORTED, "Must provide destroy function to create CeedObject");
+  obj->Destroy   = destroy_function;
+  obj->ref_count = 1;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Increment the reference counter for a `CeedObject`
+
+  @param[in,out] obj `CeedObject` to increment the reference counter
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedObjectReference(CeedObject obj) {
+  obj->ref_count++;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Decrement the reference counter for a `CeedObject`
+
+  @param[in,out] obj `CeedObject` to decrement the reference counter
+
+  @return The new reference count
+
+  @ref Backend
+**/
+int CeedObjectDereference(CeedObject obj) {
+  return --obj->ref_count;  // prefix notation, to get new number of references
+}
+
+/**
+  @brief Destroy a @ref CeedObject
+
+  @param[in,out] obj `CeedObject` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedObjectDestroy_Private(CeedObject obj) {
+  CeedCheck(obj->ref_count == 0, CeedObjectReturnCeed(obj), CEED_ERROR_UNSUPPORTED,
+            "Cannot destroy CeedObject, it is still referenced by another object");
+  if (obj->ceed) CeedCall(CeedDestroy(&obj->ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+/// @}
+
+/// ----------------------------------------------------------------------------
+/// CeedObject Public API
+/// ----------------------------------------------------------------------------
+/// @addtogroup CeedUser
+/// @{
+
+/**
+  @brief View a `CeedObject`
+
+  @param[in] obj    `CeedObject` to view
+  @param[in] stream Stream to view to, e.g., `stdout`
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedObjectView(CeedObject obj, FILE *stream) {
+  if (obj->View) CeedCall(obj->View(obj, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Set the number of tabs to indent for @ref CeedObjectView() output
+
+  @param[in] obj      `CeedObject` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedObjectSetNumViewTabs(CeedObject obj, CeedInt num_tabs) {
+  CeedCheck(num_tabs >= 0, CeedObjectReturnCeed(obj), CEED_ERROR_MINOR, "Number of view tabs must be non-negative");
+  obj->num_view_tabs = num_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get the number of tabs to indent for @ref CeedObjectView() output
+
+  @param[in]  obj      `CeedObject` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedObjectGetNumViewTabs(CeedObject obj, CeedInt *num_tabs) {
+  *num_tabs = obj->num_view_tabs;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get the `Ceed` associated with a `CeedObject`
+
+  @param[in]  obj   `CeedObject`
+  @param[out] ceed  Variable to store `Ceed`
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Advanced
+**/
+int CeedObjectGetCeed(CeedObject obj, Ceed *ceed) {
+  *ceed = NULL;
+  CeedCall(CeedReferenceCopy(CeedObjectReturnCeed(obj), ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Return the `Ceed` associated with a `CeedObject`
+
+  @param[in] obj `CeedObject`
+
+  @return `Ceed` associated with the `basis`
+
+  @ref Advanced
+**/
+Ceed CeedObjectReturnCeed(CeedObject obj) { return (obj->ceed) ? obj->ceed : (Ceed)obj; }
+
+/**
+  @brief Destroy a @ref CeedObject
+
+  @param[in,out] obj Address of `CeedObject` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedObjectDestroy(CeedObject *obj) {
+  CeedCall((*obj)->Destroy(obj));
+  return CEED_ERROR_SUCCESS;
+}
+
+/// @}
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index a4645dd2c6..55e72cb2c5 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -87,17 +87,17 @@ static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedEl
   @param[in] op_field     `CeedOperator` Field to view
   @param[in] qf_field     `CeedQFunction` Field (carries field name)
   @param[in] field_number Number of field being viewed
-  @param[in] sub          true indicates sub-operator, which increases indentation; false for top-level operator
-  @param[in] input        true for an input field; false for output field
+  @param[in] tabs         Tabs to append before each line
+  @param[in] is_input    `true` for an input field; `false` for output field
   @param[in] stream       Stream to view to, e.g., `stdout`
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref Utility
 **/
-static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt field_number, bool sub, bool input, FILE *stream) {
-  const char  *pre    = sub ? "  " : "";
-  const char  *in_out = input ? "Input" : "Output";
+static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt field_number, const char *tabs, bool is_input,
+                                 FILE *stream) {
+  const char  *field_type = is_input ? "Input" : "Output";
   const char  *field_name;
   CeedInt      size;
   CeedEvalMode eval_mode;
@@ -112,12 +112,15 @@ static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField
           "%s    %s field %" CeedInt_FMT
           ":\n"
           "%s      Name: \"%s\"\n",
-          pre, in_out, field_number, pre, field_name);
-  fprintf(stream, "%s      Size: %" CeedInt_FMT "\n", pre, size);
-  fprintf(stream, "%s      EvalMode: %s\n", pre, CeedEvalModes[eval_mode]);
-  if (basis == CEED_BASIS_NONE) fprintf(stream, "%s      No basis\n", pre);
-  if (vec == CEED_VECTOR_ACTIVE) fprintf(stream, "%s      Active vector\n", pre);
-  else if (vec == CEED_VECTOR_NONE) fprintf(stream, "%s      No vector\n", pre);
+          tabs, field_type, field_number, tabs, field_name);
+  fprintf(stream, "%s      Size: %" CeedInt_FMT "\n", tabs, size);
+  fprintf(stream, "%s      EvalMode: %s\n", tabs, CeedEvalModes[eval_mode]);
+  if (basis == CEED_BASIS_NONE) fprintf(stream, "%s      No basis\n", tabs);
+  if (vec == CEED_VECTOR_ACTIVE) fprintf(stream, "%s      Active vector\n", tabs);
+  else if (vec == CEED_VECTOR_NONE) fprintf(stream, "%s      No vector\n", tabs);
+
+  CeedCall(CeedVectorDestroy(&vec));
+  CeedCall(CeedBasisDestroy(&basis));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -125,42 +128,85 @@ static int CeedOperatorFieldView(CeedOperatorField op_field, CeedQFunctionField
   @brief View a single `CeedOperator`
 
   @param[in] op     `CeedOperator` to view
-  @param[in] sub    Boolean flag for sub-operator
+  @param[in] tabs   Tabs to append before each new line
   @param[in] stream Stream to write; typically `stdout` or a file
 
   @return Error code: 0 - success, otherwise - failure
 
   @ref Utility
 **/
-int CeedOperatorSingleView(CeedOperator op, bool sub, FILE *stream) {
-  const char         *pre = sub ? "  " : "";
+int CeedOperatorSingleView(CeedOperator op, const char *tabs, FILE *stream) {
+  bool                is_at_points;
   CeedInt             num_elem, num_qpts, total_fields = 0, num_input_fields, num_output_fields;
   CeedQFunction       qf;
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedOperatorField  *op_input_fields, *op_output_fields;
 
+  CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCall(CeedOperatorGetNumElements(op, &num_elem));
   CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts));
   CeedCall(CeedOperatorGetNumArgs(op, &total_fields));
   CeedCall(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCall(CeedOperatorGetQFunction(op, &qf));
   CeedCall(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
+  CeedCall(CeedQFunctionDestroy(&qf));
+
+  if (is_at_points) {
+    CeedInt             max_points = 0;
+    CeedElemRestriction rstr_points;
 
-  fprintf(stream, "%s  %" CeedInt_FMT " elements with %" CeedInt_FMT " quadrature points each\n", pre, num_elem, num_qpts);
-  fprintf(stream, "%s  %" CeedInt_FMT " field%s\n", pre, total_fields, total_fields > 1 ? "s" : "");
-  fprintf(stream, "%s  %" CeedInt_FMT " input field%s:\n", pre, num_input_fields, num_input_fields > 1 ? "s" : "");
+    CeedCall(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCall(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_points));
+    fprintf(stream, "%s  %" CeedInt_FMT " elements with %" CeedInt_FMT " max points each\n", tabs, num_elem, max_points);
+    CeedCall(CeedElemRestrictionDestroy(&rstr_points));
+  } else {
+    fprintf(stream, "%s  %" CeedInt_FMT " elements with %" CeedInt_FMT " quadrature points each\n", tabs, num_elem, num_qpts);
+  }
+  fprintf(stream, "%s  %" CeedInt_FMT " field%s\n", tabs, total_fields, total_fields > 1 ? "s" : "");
+  fprintf(stream, "%s  %" CeedInt_FMT " input field%s:\n", tabs, num_input_fields, num_input_fields > 1 ? "s" : "");
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCall(CeedOperatorFieldView(op_input_fields[i], qf_input_fields[i], i, sub, 1, stream));
+    CeedCall(CeedOperatorFieldView(op_input_fields[i], qf_input_fields[i], i, tabs, 1, stream));
   }
-  fprintf(stream, "%s  %" CeedInt_FMT " output field%s:\n", pre, num_output_fields, num_output_fields > 1 ? "s" : "");
+  fprintf(stream, "%s  %" CeedInt_FMT " output field%s:\n", tabs, num_output_fields, num_output_fields > 1 ? "s" : "");
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCall(CeedOperatorFieldView(op_output_fields[i], qf_output_fields[i], i, sub, 0, stream));
+    CeedCall(CeedOperatorFieldView(op_output_fields[i], qf_output_fields[i], i, tabs, 0, stream));
   }
   return CEED_ERROR_SUCCESS;
 }
 
 /**
-  @brief Find the active input vector `CeedBasis` for a non-composite `CeedOperator`
+  @brief View a `CeedOperator` passed as a `CeedObject`
+
+  @param[in] op     `CeedOperator` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedOperatorView_Object(CeedObject op, FILE *stream) {
+  CeedCall(CeedOperatorView((CeedOperator)op, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Destroy a `CeedOperator` passed as a `CeedObject`
+
+  @param[in,out] op Address of `CeedOperator` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedOperatorDestroy_Object(CeedObject *op) {
+  CeedCall(CeedOperatorDestroy((CeedOperator *)op));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Find the active input vector `CeedBasis` for a non-composite `CeedOperator`.
+
+  Note: Caller is responsible for destroying the `active_basis` with @ref CeedBasisDestroy().
 
   @param[in]  op           `CeedOperator` to find active `CeedBasis` for
   @param[out] active_basis `CeedBasis` for active input vector or `NULL` for composite operator
@@ -175,7 +221,9 @@ int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis) {
 }
 
 /**
-  @brief Find the active input and output vector `CeedBasis` for a non-composite `CeedOperator`
+  @brief Find the active input and output vector `CeedBasis` for a non-composite `CeedOperator`.
+
+  Note: Caller is responsible for destroying the bases with @ref CeedBasisDestroy().
 
   @param[in]  op                  `CeedOperator` to find active `CeedBasis` for
   @param[out] active_input_basis  `CeedBasis` for active input vector or `NULL` for composite operator
@@ -188,10 +236,8 @@ int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis) {
 int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, CeedBasis *active_output_basis) {
   bool               is_composite;
   CeedInt            num_input_fields, num_output_fields;
-  Ceed               ceed;
   CeedOperatorField *op_input_fields, *op_output_fields;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCall(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
 
@@ -206,11 +252,14 @@ int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, C
           CeedBasis basis;
 
           CeedCall(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
-          CeedCheck(!*active_input_basis || *active_input_basis == basis, ceed, CEED_ERROR_MINOR, "Multiple active input CeedBases found");
-          *active_input_basis = basis;
+          CeedCheck(!*active_input_basis || *active_input_basis == basis, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR,
+                    "Multiple active input CeedBases found");
+          if (!*active_input_basis) CeedCall(CeedBasisReferenceCopy(basis, active_input_basis));
+          CeedCall(CeedBasisDestroy(&basis));
         }
+        CeedCall(CeedVectorDestroy(&vec));
       }
-      CeedCheck(*active_input_basis, ceed, CEED_ERROR_INCOMPLETE, "No active input CeedBasis found");
+      CeedCheck(*active_input_basis, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active input CeedBasis found");
     }
   }
   if (active_output_basis) {
@@ -224,18 +273,23 @@ int CeedOperatorGetActiveBases(CeedOperator op, CeedBasis *active_input_basis, C
           CeedBasis basis;
 
           CeedCall(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
-          CeedCheck(!*active_output_basis || *active_output_basis == basis, ceed, CEED_ERROR_MINOR, "Multiple active output CeedBases found");
-          *active_output_basis = basis;
+          CeedCheck(!*active_output_basis || *active_output_basis == basis, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR,
+                    "Multiple active output CeedBases found");
+          if (!*active_output_basis) CeedCall(CeedBasisReferenceCopy(basis, active_output_basis));
+          CeedCall(CeedBasisDestroy(&basis));
         }
+        CeedCall(CeedVectorDestroy(&vec));
       }
-      CeedCheck(*active_output_basis, ceed, CEED_ERROR_INCOMPLETE, "No active output CeedBasis found");
+      CeedCheck(*active_output_basis, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active output CeedBasis found");
     }
   }
   return CEED_ERROR_SUCCESS;
 }
 
 /**
-  @brief Find the active vector `CeedElemRestriction` for a non-composite `CeedOperator`
+  @brief Find the active vector `CeedElemRestriction` for a non-composite `CeedOperator`.
+
+  Note: Caller is responsible for destroying the `active_rstr` with @ref CeedElemRestrictionDestroy().
 
   @param[in]  op          `CeedOperator` to find active `CeedElemRestriction` for
   @param[out] active_rstr `CeedElemRestriction` for active input vector or NULL for composite operator
@@ -250,7 +304,9 @@ int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *a
 }
 
 /**
-  @brief Find the active input and output vector `CeedElemRestriction` for a non-composite `CeedOperator`
+  @brief Find the active input and output vector `CeedElemRestriction` for a non-composite `CeedOperator`.
+
+  Note: Caller is responsible for destroying the restrictions with @ref CeedElemRestrictionDestroy().
 
   @param[in]  op                 `CeedOperator` to find active `CeedElemRestriction` for
   @param[out] active_input_rstr  `CeedElemRestriction` for active input vector or NULL for composite operator
@@ -263,10 +319,8 @@ int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *a
 int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *active_input_rstr, CeedElemRestriction *active_output_rstr) {
   bool               is_composite;
   CeedInt            num_input_fields, num_output_fields;
-  Ceed               ceed;
   CeedOperatorField *op_input_fields, *op_output_fields;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCall(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
 
@@ -281,11 +335,14 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *
           CeedElemRestriction rstr;
 
           CeedCall(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr));
-          CeedCheck(!*active_input_rstr || *active_input_rstr == rstr, ceed, CEED_ERROR_MINOR, "Multiple active input CeedElemRestrictions found");
-          *active_input_rstr = rstr;
+          CeedCheck(!*active_input_rstr || *active_input_rstr == rstr, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR,
+                    "Multiple active input CeedElemRestrictions found");
+          if (!*active_input_rstr) CeedCall(CeedElemRestrictionReferenceCopy(rstr, active_input_rstr));
+          CeedCall(CeedElemRestrictionDestroy(&rstr));
         }
+        CeedCall(CeedVectorDestroy(&vec));
       }
-      CeedCheck(*active_input_rstr, ceed, CEED_ERROR_INCOMPLETE, "No active input CeedElemRestriction found");
+      CeedCheck(*active_input_rstr, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active input CeedElemRestriction found");
     }
   }
   if (active_output_rstr) {
@@ -299,11 +356,14 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *
           CeedElemRestriction rstr;
 
           CeedCall(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr));
-          CeedCheck(!*active_output_rstr || *active_output_rstr == rstr, ceed, CEED_ERROR_MINOR, "Multiple active output CeedElemRestrictions found");
-          *active_output_rstr = rstr;
+          CeedCheck(!*active_output_rstr || *active_output_rstr == rstr, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR,
+                    "Multiple active output CeedElemRestrictions found");
+          if (!*active_output_rstr) CeedCall(CeedElemRestrictionReferenceCopy(rstr, active_output_rstr));
+          CeedCall(CeedElemRestrictionDestroy(&rstr));
         }
+        CeedCall(CeedVectorDestroy(&vec));
       }
-      CeedCheck(*active_output_rstr, ceed, CEED_ERROR_INCOMPLETE, "No active output CeedElemRestriction found");
+      CeedCheck(*active_output_rstr, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No active output CeedElemRestriction found");
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -322,14 +382,12 @@ int CeedOperatorGetActiveElemRestrictions(CeedOperator op, CeedElemRestriction *
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref User
+  @ref Developer
 **/
 static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, void *values) {
   bool is_composite = false;
-  Ceed ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
-  CeedCheck(field_label, ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label");
+  CeedCheck(field_label, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Invalid field label");
 
   // Check if field_label and op correspond
   if (field_label->from_op) {
@@ -338,7 +396,7 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
     for (CeedInt i = 0; i < op->num_context_labels; i++) {
       if (op->context_labels[i] == field_label) index = i;
     }
-    CeedCheck(index != -1, ceed, CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
+    CeedCheck(index != -1, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
   }
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -346,29 +404,28 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
     CeedInt       num_sub;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    CeedCheck(num_sub == field_label->num_sub_labels, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator modified after ContextFieldLabel created");
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+    CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+              "Composite operator modified after ContextFieldLabel created");
 
     for (CeedInt i = 0; i < num_sub; i++) {
-      CeedQFunction        qf;
       CeedQFunctionContext ctx;
 
-      CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf));
-      CeedCall(CeedQFunctionGetContext(qf, &ctx));
+      CeedCall(CeedOperatorGetContext(sub_operators[i], &ctx));
       // Try every sub-operator, ok if some sub-operators do not have field
-      if (field_label->sub_labels[i] && ctx) {
+      if (ctx && field_label->sub_labels[i]) {
         CeedCall(CeedQFunctionContextSetGeneric(ctx, field_label->sub_labels[i], field_type, values));
       }
+      CeedCall(CeedQFunctionContextDestroy(&ctx));
     }
   } else {
-    CeedQFunction        qf;
     CeedQFunctionContext ctx;
 
-    CeedCall(CeedOperatorGetQFunction(op, &qf));
-    CeedCall(CeedQFunctionGetContext(qf, &ctx));
-    CeedCheck(ctx, ceed, CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
+    CeedCall(CeedOperatorGetContext(op, &ctx));
+    CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextSetGeneric(ctx, field_label, field_type, values));
+    CeedCall(CeedQFunctionContextDestroy(&ctx));
   }
   CeedCall(CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(op, true));
   return CEED_ERROR_SUCCESS;
@@ -388,15 +445,13 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref User
+  @ref Developer
 **/
 static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, size_t *num_values,
                                              void *values) {
   bool is_composite = false;
-  Ceed ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
-  CeedCheck(field_label, ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label");
+  CeedCheck(field_label, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Invalid field label");
 
   *(void **)values = NULL;
   *num_values      = 0;
@@ -408,7 +463,7 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
     for (CeedInt i = 0; i < op->num_context_labels; i++) {
       if (op->context_labels[i] == field_label) index = i;
     }
-    CeedCheck(index != -1, ceed, CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
+    CeedCheck(index != -1, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
   }
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -416,30 +471,30 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
     CeedInt       num_sub;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    CeedCheck(num_sub == field_label->num_sub_labels, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator modified after ContextFieldLabel created");
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+    CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+              "Composite operator modified after ContextFieldLabel created");
 
     for (CeedInt i = 0; i < num_sub; i++) {
-      CeedQFunction        qf;
       CeedQFunctionContext ctx;
 
-      CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf));
-      CeedCall(CeedQFunctionGetContext(qf, &ctx));
+      CeedCall(CeedOperatorGetContext(sub_operators[i], &ctx));
       // Try every sub-operator, ok if some sub-operators do not have field
-      if (field_label->sub_labels[i] && ctx) {
+      if (ctx && field_label->sub_labels[i]) {
         CeedCall(CeedQFunctionContextGetGenericRead(ctx, field_label->sub_labels[i], field_type, num_values, values));
+        CeedCall(CeedQFunctionContextDestroy(&ctx));
         return CEED_ERROR_SUCCESS;
       }
+      CeedCall(CeedQFunctionContextDestroy(&ctx));
     }
   } else {
-    CeedQFunction        qf;
     CeedQFunctionContext ctx;
 
-    CeedCall(CeedOperatorGetQFunction(op, &qf));
-    CeedCall(CeedQFunctionGetContext(qf, &ctx));
-    CeedCheck(ctx, ceed, CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
+    CeedCall(CeedOperatorGetContext(op, &ctx));
+    CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextGetGenericRead(ctx, field_label, field_type, num_values, values));
+    CeedCall(CeedQFunctionContextDestroy(&ctx));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -457,14 +512,12 @@ static int CeedOperatorContextGetGenericRead(CeedOperator op, CeedContextFieldLa
 
   @return An error code: 0 - success, otherwise - failure
 
-  @ref User
+  @ref Developer
 **/
 static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, void *values) {
   bool is_composite = false;
-  Ceed ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
-  CeedCheck(field_label, ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label");
+  CeedCheck(field_label, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Invalid field label");
 
   // Check if field_label and op correspond
   if (field_label->from_op) {
@@ -473,7 +526,7 @@ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFie
     for (CeedInt i = 0; i < op->num_context_labels; i++) {
       if (op->context_labels[i] == field_label) index = i;
     }
-    CeedCheck(index != -1, ceed, CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
+    CeedCheck(index != -1, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "ContextFieldLabel does not correspond to the operator");
   }
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -481,30 +534,30 @@ static int CeedOperatorContextRestoreGenericRead(CeedOperator op, CeedContextFie
     CeedInt       num_sub;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    CeedCheck(num_sub == field_label->num_sub_labels, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator modified after ContextFieldLabel created");
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+    CeedCheck(num_sub == field_label->num_sub_labels, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+              "Composite operator modified after ContextFieldLabel created");
 
     for (CeedInt i = 0; i < num_sub; i++) {
-      CeedQFunction        qf;
       CeedQFunctionContext ctx;
 
-      CeedCall(CeedOperatorGetQFunction(sub_operators[i], &qf));
-      CeedCall(CeedQFunctionGetContext(qf, &ctx));
+      CeedCall(CeedOperatorGetContext(sub_operators[i], &ctx));
       // Try every sub-operator, ok if some sub-operators do not have field
-      if (field_label->sub_labels[i] && ctx) {
+      if (ctx && field_label->sub_labels[i]) {
         CeedCall(CeedQFunctionContextRestoreGenericRead(ctx, field_label->sub_labels[i], field_type, values));
+        CeedCall(CeedQFunctionContextDestroy(&ctx));
         return CEED_ERROR_SUCCESS;
       }
+      CeedCall(CeedQFunctionContextDestroy(&ctx));
     }
   } else {
-    CeedQFunction        qf;
     CeedQFunctionContext ctx;
 
-    CeedCall(CeedOperatorGetQFunction(op, &qf));
-    CeedCall(CeedQFunctionGetContext(qf, &ctx));
-    CeedCheck(ctx, ceed, CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
+    CeedCall(CeedOperatorGetContext(op, &ctx));
+    CeedCheck(ctx, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "QFunction does not have context data");
     CeedCall(CeedQFunctionContextRestoreGenericRead(ctx, field_label, field_type, values));
+    CeedCall(CeedQFunctionContextDestroy(&ctx));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -561,8 +614,9 @@ int CeedOperatorHasTensorBases(CeedOperator op, bool *has_tensor_bases) {
     CeedCall(CeedOperatorFieldGetBasis(input_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
       CeedCall(CeedBasisIsTensor(basis, &is_tensor));
-      *has_tensor_bases &= is_tensor;
+      *has_tensor_bases = *has_tensor_bases & is_tensor;
     }
+    CeedCall(CeedBasisDestroy(&basis));
   }
   for (CeedInt i = 0; i < num_outputs; i++) {
     bool      is_tensor;
@@ -571,8 +625,9 @@ int CeedOperatorHasTensorBases(CeedOperator op, bool *has_tensor_bases) {
     CeedCall(CeedOperatorFieldGetBasis(output_fields[i], &basis));
     if (basis != CEED_BASIS_NONE) {
       CeedCall(CeedBasisIsTensor(basis, &is_tensor));
-      *has_tensor_bases &= is_tensor;
+      *has_tensor_bases = *has_tensor_bases & is_tensor;
     }
+    CeedCall(CeedBasisDestroy(&basis));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -622,7 +677,8 @@ int CeedOperatorGetQFunction(CeedOperator op, CeedQFunction *qf) {
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Not defined for composite operator");
-  *qf = op->qf;
+  *qf = NULL;
+  CeedCall(CeedQFunctionReferenceCopy(op->qf, qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -681,7 +737,7 @@ int CeedOperatorSetData(CeedOperator op, void *data) {
   @ref Backend
 **/
 int CeedOperatorReference(CeedOperator op) {
-  op->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)op));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -727,22 +783,21 @@ int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunc
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorCreate");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedOperatorCreate");
     CeedCall(CeedOperatorCreate(delegate, qf, dqf, dqfT, op));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
   CeedCheck(qf && qf != CEED_QFUNCTION_NONE, ceed, CEED_ERROR_MINOR, "Operator must have a valid CeedQFunction.");
 
   CeedCall(CeedCalloc(1, op));
-  CeedCall(CeedReferenceCopy(ceed, &(*op)->ceed));
-  (*op)->ref_count   = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, CeedOperatorDestroy_Object, &(*op)->obj));
   (*op)->input_size  = -1;
   (*op)->output_size = -1;
   CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf));
   if (dqf && dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqf, &(*op)->dqf));
   if (dqfT && dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqfT, &(*op)->dqfT));
-  CeedCall(CeedQFunctionAssemblyDataCreate(ceed, &(*op)->qf_assembled));
   CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->input_fields));
   CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->output_fields));
   CeedCall(ceed->OperatorCreate(*op));
@@ -770,23 +825,22 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorCreateAtPoints");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedOperatorCreateAtPoints");
     CeedCall(CeedOperatorCreateAtPoints(delegate, qf, dqf, dqfT, op));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
   CeedCheck(qf && qf != CEED_QFUNCTION_NONE, ceed, CEED_ERROR_MINOR, "Operator must have a valid CeedQFunction.");
 
   CeedCall(CeedCalloc(1, op));
-  CeedCall(CeedReferenceCopy(ceed, &(*op)->ceed));
-  (*op)->ref_count    = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, CeedOperatorDestroy_Object, &(*op)->obj));
   (*op)->is_at_points = true;
   (*op)->input_size   = -1;
   (*op)->output_size  = -1;
   CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf));
   if (dqf && dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqf, &(*op)->dqf));
   if (dqfT && dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqfT, &(*op)->dqfT));
-  CeedCall(CeedQFunctionAssemblyDataCreate(ceed, &(*op)->qf_assembled));
   CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->input_fields));
   CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->output_fields));
   CeedCall(ceed->OperatorCreateAtPoints(*op));
@@ -803,20 +857,20 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C
 
   @ref User
  */
-int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op) {
+int CeedOperatorCreateComposite(Ceed ceed, CeedOperator *op) {
   if (!ceed->CompositeOperatorCreate) {
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator"));
     if (delegate) {
-      CeedCall(CeedCompositeOperatorCreate(delegate, op));
+      CeedCall(CeedOperatorCreateComposite(delegate, op));
+      CeedCall(CeedDestroy(&delegate));
       return CEED_ERROR_SUCCESS;
     }
   }
 
   CeedCall(CeedCalloc(1, op));
-  CeedCall(CeedReferenceCopy(ceed, &(*op)->ceed));
-  (*op)->ref_count    = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedOperatorView_Object, CeedOperatorDestroy_Object, &(*op)->obj));
   (*op)->is_composite = true;
   CeedCall(CeedCalloc(CEED_COMPOSITE_MAX, &(*op)->sub_operators));
   (*op)->input_size  = -1;
@@ -874,38 +928,38 @@ int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy) {
 int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction rstr, CeedBasis basis, CeedVector vec) {
   bool               is_input = true, is_at_points, is_composite, is_immutable;
   CeedInt            num_elem = 0, num_qpts = 0, num_input_fields, num_output_fields;
-  Ceed               ceed;
   CeedQFunction      qf;
   CeedQFunctionField qf_field, *qf_input_fields, *qf_output_fields;
   CeedOperatorField *op_field;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   CeedCall(CeedOperatorIsImmutable(op, &is_immutable));
-  CeedCheck(!is_composite, ceed, CEED_ERROR_INCOMPATIBLE, "Cannot add field to composite operator.");
-  CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
-  CeedCheck(rstr, ceed, CEED_ERROR_INCOMPATIBLE, "CeedElemRestriction rstr for field \"%s\" must be non-NULL.", field_name);
-  CeedCheck(basis, ceed, CEED_ERROR_INCOMPATIBLE, "CeedBasis basis for field \"%s\" must be non-NULL.", field_name);
-  CeedCheck(vec, ceed, CEED_ERROR_INCOMPATIBLE, "CeedVector vec for field \"%s\" must be non-NULL.", field_name);
+  CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "Cannot add field to composite operator.");
+  CeedCheck(!is_immutable, CeedOperatorReturnCeed(op), CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
+  CeedCheck(rstr, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "CeedElemRestriction rstr for field \"%s\" must be non-NULL.", field_name);
+  CeedCheck(basis, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "CeedBasis basis for field \"%s\" must be non-NULL.", field_name);
+  CeedCheck(vec, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "CeedVector vec for field \"%s\" must be non-NULL.", field_name);
 
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCheck(rstr == CEED_ELEMRESTRICTION_NONE || !op->has_restriction || num_elem == op->num_elem, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(rstr == CEED_ELEMRESTRICTION_NONE || !op->has_restriction || num_elem == op->num_elem, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION,
             "CeedElemRestriction with %" CeedInt_FMT " elements incompatible with prior %" CeedInt_FMT " elements", num_elem, op->num_elem);
   {
     CeedRestrictionType rstr_type;
 
     CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type));
     if (rstr_type == CEED_RESTRICTION_POINTS) {
-      CeedCheck(is_at_points, ceed, CEED_ERROR_UNSUPPORTED, "CeedElemRestriction AtPoints not supported for standard operator fields");
-      CeedCheck(basis == CEED_BASIS_NONE, ceed, CEED_ERROR_UNSUPPORTED, "CeedElemRestriction AtPoints must be used with CEED_BASIS_NONE");
+      CeedCheck(is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+                "CeedElemRestriction AtPoints not supported for standard operator fields");
+      CeedCheck(basis == CEED_BASIS_NONE, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+                "CeedElemRestriction AtPoints must be used with CEED_BASIS_NONE");
       if (!op->first_points_rstr) {
         CeedCall(CeedElemRestrictionReferenceCopy(rstr, &op->first_points_rstr));
       } else {
         bool are_compatible;
 
         CeedCall(CeedElemRestrictionAtPointsAreCompatible(op->first_points_rstr, rstr, &are_compatible));
-        CeedCheck(are_compatible, ceed, CEED_ERROR_INCOMPATIBLE,
+        CeedCheck(are_compatible, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE,
                   "CeedElemRestriction must have compatible offsets with previously set CeedElemRestriction");
       }
     }
@@ -913,13 +967,14 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri
 
   if (basis == CEED_BASIS_NONE) CeedCall(CeedElemRestrictionGetElementSize(rstr, &num_qpts));
   else CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
-  CeedCheck(op->num_qpts == 0 || num_qpts == op->num_qpts, ceed, CEED_ERROR_DIMENSION,
+  CeedCheck(op->num_qpts == 0 || num_qpts == op->num_qpts, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION,
             "%s must correspond to the same number of quadrature points as previously added CeedBases. Found %" CeedInt_FMT
             " quadrature points but expected %" CeedInt_FMT " quadrature points.",
             basis == CEED_BASIS_NONE ? "CeedElemRestriction" : "CeedBasis", num_qpts, op->num_qpts);
 
   CeedCall(CeedOperatorGetQFunction(op, &qf));
   CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_input_fields, &num_output_fields, &qf_output_fields));
+  CeedCall(CeedQFunctionDestroy(&qf));
   for (CeedInt i = 0; i < num_input_fields; i++) {
     const char *qf_field_name;
 
@@ -942,10 +997,10 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri
     }
   }
   // LCOV_EXCL_START
-  return CeedError(ceed, CEED_ERROR_INCOMPLETE, "CeedQFunction has no knowledge of field '%s'", field_name);
+  return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "CeedQFunction has no knowledge of field '%s'", field_name);
   // LCOV_EXCL_STOP
 found:
-  CeedCall(CeedOperatorCheckField(ceed, qf_field, rstr, basis));
+  CeedCall(CeedOperatorCheckField(CeedOperatorReturnCeed(op), qf_field, rstr, basis));
   CeedCall(CeedCalloc(1, op_field));
 
   if (vec == CEED_VECTOR_ACTIVE) {
@@ -954,11 +1009,11 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri
     CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
     if (is_input) {
       if (op->input_size == -1) op->input_size = l_size;
-      CeedCheck(l_size == op->input_size, ceed, CEED_ERROR_INCOMPATIBLE,
+      CeedCheck(l_size == op->input_size, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE,
                 "LVector size %" CeedSize_FMT " does not match previous size %" CeedSize_FMT "", l_size, op->input_size);
     } else {
       if (op->output_size == -1) op->output_size = l_size;
-      CeedCheck(l_size == op->output_size, ceed, CEED_ERROR_INCOMPATIBLE,
+      CeedCheck(l_size == op->output_size, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE,
                 "LVector size %" CeedSize_FMT " does not match previous size %" CeedSize_FMT "", l_size, op->output_size);
     }
   }
@@ -1002,6 +1057,7 @@ int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperat
 
   CeedCall(CeedOperatorGetQFunction(op, &qf));
   CeedCall(CeedQFunctionGetFields(qf, num_input_fields, NULL, num_output_fields, NULL));
+  CeedCall(CeedQFunctionDestroy(&qf));
   if (input_fields) *input_fields = op->input_fields;
   if (output_fields) *output_fields = op->output_fields;
   return CEED_ERROR_SUCCESS;
@@ -1022,13 +1078,11 @@ int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperat
 **/
 int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_points, CeedVector point_coords) {
   bool is_at_points, is_immutable;
-  Ceed ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCall(CeedOperatorIsImmutable(op, &is_immutable));
-  CeedCheck(is_at_points, ceed, CEED_ERROR_MINOR, "Only defined for operator at points");
-  CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
+  CeedCheck(is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for operator at points");
+  CeedCheck(!is_immutable, CeedOperatorReturnCeed(op), CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
 
   if (!op->first_points_rstr) {
     CeedCall(CeedElemRestrictionReferenceCopy(rstr_points, &op->first_points_rstr));
@@ -1036,7 +1090,7 @@ int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_poin
     bool are_compatible;
 
     CeedCall(CeedElemRestrictionAtPointsAreCompatible(op->first_points_rstr, rstr_points, &are_compatible));
-    CeedCheck(are_compatible, ceed, CEED_ERROR_INCOMPATIBLE,
+    CeedCheck(are_compatible, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE,
               "CeedElemRestriction must have compatible offsets with previously set field CeedElemRestriction");
   }
 
@@ -1047,10 +1101,10 @@ int CeedOperatorAtPointsSetPoints(CeedOperator op, CeedElemRestriction rstr_poin
 
 /**
   @brief Get a boolean value indicating if the `CeedOperator` was created with `CeedOperatorCreateAtPoints`
-    
+
   @param[in]  op           `CeedOperator`
   @param[out] is_at_points Variable to store at points status
-  
+
   @return An error code: 0 - success, otherwise - failure
 
   @ref User
@@ -1080,8 +1134,14 @@ int CeedOperatorAtPointsGetPoints(CeedOperator op, CeedElemRestriction *rstr_poi
   CeedCheck(is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for operator at points");
   CeedCall(CeedOperatorCheckReady(op));
 
-  if (rstr_points) CeedCall(CeedElemRestrictionReferenceCopy(op->rstr_points, rstr_points));
-  if (point_coords) CeedCall(CeedVectorReferenceCopy(op->point_coords, point_coords));
+  if (rstr_points) {
+    *rstr_points = NULL;
+    CeedCall(CeedElemRestrictionReferenceCopy(op->rstr_points, rstr_points));
+  }
+  if (point_coords) {
+    *point_coords = NULL;
+    CeedCall(CeedVectorReferenceCopy(op->point_coords, point_coords));
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1140,7 +1200,9 @@ int CeedOperatorFieldGetName(CeedOperatorField op_field, const char **field_name
 }
 
 /**
-  @brief Get the `CeedElemRestriction` of a `CeedOperator` Field
+  @brief Get the `CeedElemRestriction` of a `CeedOperator` Field.
+
+  Note: Caller is responsible for destroying the `rstr` with @ref CeedElemRestrictionDestroy().
 
   @param[in]  op_field `CeedOperator` Field
   @param[out] rstr     Variable to store `CeedElemRestriction`
@@ -1150,12 +1212,15 @@ int CeedOperatorFieldGetName(CeedOperatorField op_field, const char **field_name
   @ref Advanced
 **/
 int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, CeedElemRestriction *rstr) {
-  *rstr = op_field->elem_rstr;
+  *rstr = NULL;
+  CeedCall(CeedElemRestrictionReferenceCopy(op_field->elem_rstr, rstr));
   return CEED_ERROR_SUCCESS;
 }
 
 /**
-  @brief Get the `CeedBasis` of a `CeedOperator` Field
+  @brief Get the `CeedBasis` of a `CeedOperator` Field.
+
+  Note: Caller is responsible for destroying the `basis` with @ref CeedBasisDestroy().
 
   @param[in]  op_field `CeedOperator` Field
   @param[out] basis    Variable to store `CeedBasis`
@@ -1165,12 +1230,15 @@ int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, CeedElemRest
   @ref Advanced
 **/
 int CeedOperatorFieldGetBasis(CeedOperatorField op_field, CeedBasis *basis) {
-  *basis = op_field->basis;
+  *basis = NULL;
+  CeedCall(CeedBasisReferenceCopy(op_field->basis, basis));
   return CEED_ERROR_SUCCESS;
 }
 
 /**
-  @brief Get the `CeedVector` of a `CeedOperator` Field
+  @brief Get the `CeedVector` of a `CeedOperator` Field.
+
+  Note: Caller is responsible for destroying the `vec` with @ref CeedVectorDestroy().
 
   @param[in]  op_field `CeedOperator` Field
   @param[out] vec      Variable to store `CeedVector`
@@ -1180,14 +1248,17 @@ int CeedOperatorFieldGetBasis(CeedOperatorField op_field, CeedBasis *basis) {
   @ref Advanced
 **/
 int CeedOperatorFieldGetVector(CeedOperatorField op_field, CeedVector *vec) {
-  *vec = op_field->vec;
+  *vec = NULL;
+  CeedCall(CeedVectorReferenceCopy(op_field->vec, vec));
   return CEED_ERROR_SUCCESS;
 }
 
 /**
   @brief Get the data of a `CeedOperator` Field.
 
-  Any arguments set as `NULL` are ignored.
+  Any arguments set as `NULL` are ignored..
+
+  Note: Caller is responsible for destroying the `rstr`, `basis`, and `vec`.
 
   @param[in]  op_field   `CeedOperator` Field
   @param[out] field_name Variable to store the field name
@@ -1217,15 +1288,14 @@ int CeedOperatorFieldGetData(CeedOperatorField op_field, const char **field_name
 
   @ref User
  */
-int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op) {
+int CeedOperatorCompositeAddSub(CeedOperator composite_op, CeedOperator sub_op) {
   bool is_immutable;
-  Ceed ceed;
 
-  CeedCall(CeedOperatorGetCeed(composite_op, &ceed));
-  CeedCheck(composite_op->is_composite, ceed, CEED_ERROR_MINOR, "CeedOperator is not a composite operator");
-  CeedCheck(composite_op->num_suboperators < CEED_COMPOSITE_MAX, ceed, CEED_ERROR_UNSUPPORTED, "Cannot add additional sub-operators");
+  CeedCheck(composite_op->is_composite, CeedOperatorReturnCeed(composite_op), CEED_ERROR_MINOR, "CeedOperator is not a composite operator");
+  CeedCheck(composite_op->num_suboperators < CEED_COMPOSITE_MAX, CeedOperatorReturnCeed(composite_op), CEED_ERROR_UNSUPPORTED,
+            "Cannot add additional sub-operators");
   CeedCall(CeedOperatorIsImmutable(composite_op, &is_immutable));
-  CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
+  CeedCheck(!is_immutable, CeedOperatorReturnCeed(composite_op), CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable");
 
   {
     CeedSize input_size, output_size;
@@ -1234,8 +1304,8 @@ int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op)
     if (composite_op->input_size == -1) composite_op->input_size = input_size;
     if (composite_op->output_size == -1) composite_op->output_size = output_size;
     // Note, a size of -1 means no active vector restriction set, so no incompatibility
-    CeedCheck((input_size == -1 || input_size == composite_op->input_size) && (output_size == -1 || output_size == composite_op->output_size), ceed,
-              CEED_ERROR_MAJOR,
+    CeedCheck((input_size == -1 || input_size == composite_op->input_size) && (output_size == -1 || output_size == composite_op->output_size),
+              CeedOperatorReturnCeed(composite_op), CEED_ERROR_MAJOR,
               "Sub-operators must have compatible dimensions; composite operator of shape (%" CeedSize_FMT ", %" CeedSize_FMT
               ") not compatible with sub-operator of "
               "shape (%" CeedSize_FMT ", %" CeedSize_FMT ")",
@@ -1258,7 +1328,7 @@ int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op)
 
   @ref Backend
 **/
-int CeedCompositeOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators) {
+int CeedOperatorCompositeGetNumSub(CeedOperator op, CeedInt *num_suboperators) {
   bool is_composite;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -1277,7 +1347,7 @@ int CeedCompositeOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators) {
 
   @ref Backend
 **/
-int CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operators) {
+int CeedOperatorCompositeGetSubList(CeedOperator op, CeedOperator **sub_operators) {
   bool is_composite;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -1286,6 +1356,82 @@ int CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operator
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get a sub `CeedOperator` of a composite `CeedOperator` from its name.
+
+  `sub_op` is set to `NULL` if the sub operator is not found.
+
+  Note: Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable.
+
+  @param[in]  op      Composite `CeedOperator`
+  @param[in]  op_name Name of desired sub `CeedOperator`
+  @param[out] sub_op  Sub `CeedOperator` corresponding to the name
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Advanced
+**/
+int CeedOperatorCompositeGetSubByName(CeedOperator op, const char *op_name, CeedOperator *sub_op) {
+  bool          is_composite;
+  CeedInt       num_sub_ops;
+  CeedOperator *sub_ops;
+
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  CeedCheck(is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for a composite operator");
+  *sub_op = NULL;
+  CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub_ops));
+  CeedCall(CeedOperatorCompositeGetSubList(op, &sub_ops));
+  for (CeedInt i = 0; i < num_sub_ops; i++) {
+    if (sub_ops[i]->name && !strcmp(op_name, sub_ops[i]->name)) {
+      *sub_op = sub_ops[i];
+      return CEED_ERROR_SUCCESS;
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Set whether the sub-operators of the composite `CeedOperator` must be run sequentially.
+
+  Note: This value currently only affects the GPU `/gpu/cuda/gen` and `/gpu/hip/gen` backends.
+
+  @param[in] op            Composite `CeedOperator`
+  @param[in] is_sequential Flag value to set, if `true`, forces the composite `CeedOperator` to execute sequentially
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Advanced
+**/
+int CeedOperatorCompositeSetSequential(CeedOperator op, bool is_sequential) {
+  bool is_composite;
+
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  CeedCheck(is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for a composite operator");
+  op->is_sequential = is_sequential;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get whether the sub-operators of the composite `CeedOperator` must be run sequentially.
+
+  Note: This value currently only affects the GPU `/gpu/cuda/gen` and `/gpu/hip/gen` backends.
+
+  @param[in]  op            Composite `CeedOperator`
+  @param[out] is_sequential Variable to store sequential status
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Advanced
+**/
+int CeedOperatorCompositeIsSequential(CeedOperator op, bool *is_sequential) {
+  bool is_composite;
+
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  CeedCheck(is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for a composite operator");
+  *is_sequential = op->is_sequential;
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Check if a `CeedOperator` is ready to be used.
 
@@ -1297,19 +1443,17 @@ int CeedCompositeOperatorGetSubList(CeedOperator op, CeedOperator **sub_operator
 **/
 int CeedOperatorCheckReady(CeedOperator op) {
   bool          is_at_points, is_composite;
-  Ceed          ceed;
   CeedQFunction qf = NULL;
 
   if (op->is_interface_setup) return CEED_ERROR_SUCCESS;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
   if (!is_composite) CeedCall(CeedOperatorGetQFunction(op, &qf));
   if (is_composite) {
     CeedInt num_suboperators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
     if (!num_suboperators) {
       // Empty operator setup
       op->input_size  = 0;
@@ -1317,7 +1461,7 @@ int CeedOperatorCheckReady(CeedOperator op) {
     } else {
       CeedOperator *sub_operators;
 
-      CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+      CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
       for (CeedInt i = 0; i < num_suboperators; i++) {
         CeedCall(CeedOperatorCheckReady(sub_operators[i]));
       }
@@ -1329,17 +1473,19 @@ int CeedOperatorCheckReady(CeedOperator op) {
   } else {
     CeedInt num_input_fields, num_output_fields;
 
-    CeedCheck(op->num_fields > 0, ceed, CEED_ERROR_INCOMPLETE, "No operator fields set");
+    CeedCheck(op->num_fields > 0, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "No operator fields set");
     CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, NULL, &num_output_fields, NULL));
-    CeedCheck(op->num_fields == num_input_fields + num_output_fields, ceed, CEED_ERROR_INCOMPLETE, "Not all operator fields set");
-    CeedCheck(op->has_restriction, ceed, CEED_ERROR_INCOMPLETE, "At least one restriction required");
-    CeedCheck(op->num_qpts > 0 || is_at_points, ceed, CEED_ERROR_INCOMPLETE,
+    CeedCheck(op->num_fields == num_input_fields + num_output_fields, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE,
+              "Not all operator fields set");
+    CeedCheck(op->has_restriction, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE, "At least one restriction required");
+    CeedCheck(op->num_qpts > 0 || is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE,
               "At least one non-collocated CeedBasis is required or the number of quadrature points must be set");
   }
 
   // Flag as immutable and ready
   op->is_interface_setup = true;
   if (qf && qf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionSetImmutable(qf));
+  CeedCall(CeedQFunctionDestroy(&qf));
   if (op->dqf && op->dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionSetImmutable(op->dqf));
   if (op->dqfT && op->dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionSetImmutable(op->dqfT));
   return CEED_ERROR_SUCCESS;
@@ -1369,8 +1515,8 @@ int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, Ce
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     for (CeedInt i = 0; i < num_suboperators; i++) {
       CeedSize sub_input_size, sub_output_size;
 
@@ -1411,7 +1557,10 @@ int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_d
       CeedCall(CeedOperatorSetQFunctionAssemblyReuse(op->sub_operators[i], reuse_assembly_data));
     }
   } else {
-    CeedCall(CeedQFunctionAssemblyDataSetReuse(op->qf_assembled, reuse_assembly_data));
+    CeedQFunctionAssemblyData data;
+
+    CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data));
+    CeedCall(CeedQFunctionAssemblyDataSetReuse(data, reuse_assembly_data));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1434,13 +1583,16 @@ int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     for (CeedInt i = 0; i < num_suboperators; i++) {
       CeedCall(CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(sub_operators[i], needs_data_update));
     }
   } else {
-    CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(op->qf_assembled, needs_data_update));
+    CeedQFunctionAssemblyData data;
+
+    CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data));
+    CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(data, needs_data_update));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1469,36 +1621,135 @@ int CeedOperatorSetName(CeedOperator op, const char *name) {
 }
 
 /**
-  @brief View a `CeedOperator`
+  @brief Get name of `CeedOperator`
 
-  @param[in] op     `CeedOperator` to view
-  @param[in] stream Stream to write; typically `stdout` or a file
+  @param[in]     op   `CeedOperator`
+  @param[in,out] name Address of variable to hold currently set name
 
-  @return Error code: 0 - success, otherwise - failure
+  @return An error code: 0 - success, otherwise - failure
 
   @ref User
 **/
-int CeedOperatorView(CeedOperator op, FILE *stream) {
-  bool has_name = op->name, is_composite;
+int CeedOperatorGetName(CeedOperator op, const char **name) {
+  if (op->name) {
+    *name = op->name;
+  } else if (!op->is_composite) {
+    CeedQFunction qf;
+
+    CeedCall(CeedOperatorGetQFunction(op, &qf));
+    if (qf) CeedCall(CeedQFunctionGetName(qf, name));
+    CeedCall(CeedQFunctionDestroy(&qf));
+  }
+  return CEED_ERROR_SUCCESS;
+}
 
+/**
+  @brief Core logic for viewing a `CeedOperator`
+
+  @param[in] op     `CeedOperator` to view brief summary
+  @param[in] stream  Stream to write; typically `stdout` or a file
+  @param[in] is_full Whether to write full operator view or terse
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedOperatorView_Core(CeedOperator op, FILE *stream, bool is_full) {
+  bool        has_name, is_composite, is_at_points;
+  char       *tabs     = NULL;
+  const char *name     = NULL;
+  CeedInt     num_tabs = 0;
+
+  CeedCall(CeedOperatorGetName(op, &name));
+  has_name = name ? strlen(name) : false;
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+  // Set tabs
+  CeedCall(CeedOperatorGetNumViewTabs(op, &num_tabs));
+  CeedCall(CeedCalloc(CEED_TAB_WIDTH * (num_tabs + is_composite) + 1, &tabs));
+  for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
   if (is_composite) {
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    fprintf(stream, "Composite CeedOperator%s%s\n", has_name ? " - " : "", has_name ? op->name : "");
-
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+    fprintf(stream, "%s", tabs);
+    fprintf(stream, "Composite CeedOperator%s%s\n", has_name ? " - " : "", has_name ? name : "");
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH; i++) tabs[CEED_TAB_WIDTH * num_tabs + i] = ' ';
     for (CeedInt i = 0; i < num_suboperators; i++) {
       has_name = sub_operators[i]->name;
-      fprintf(stream, "  SubOperator %" CeedInt_FMT "%s%s:\n", i, has_name ? " - " : "", has_name ? sub_operators[i]->name : "");
-      CeedCall(CeedOperatorSingleView(sub_operators[i], 1, stream));
+      fprintf(stream, "%s", tabs);
+      fprintf(stream, "SubOperator%s %" CeedInt_FMT "%s%s%s\n", is_at_points ? " AtPoints" : "", i, has_name ? " - " : "",
+              has_name ? sub_operators[i]->name : "", is_full ? ":" : "");
+      if (is_full) CeedCall(CeedOperatorSingleView(sub_operators[i], tabs, stream));
     }
   } else {
-    fprintf(stream, "CeedOperator%s%s\n", has_name ? " - " : "", has_name ? op->name : "");
-    CeedCall(CeedOperatorSingleView(op, 0, stream));
+    fprintf(stream, "%s", tabs);
+    fprintf(stream, "CeedOperator%s%s%s\n", is_at_points ? " AtPoints" : "", has_name ? " - " : "", has_name ? name : "");
+    if (is_full) CeedCall(CeedOperatorSingleView(op, tabs, stream));
   }
+  CeedCall(CeedFree(&tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Set the number of tabs to indent for @ref CeedOperatorView() output
+
+  @param[in] op       `CeedOperator` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorSetNumViewTabs(CeedOperator op, CeedInt num_tabs) {
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)op, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get the number of tabs to indent for @ref CeedOperatorView() output
+
+  @param[in]  op       `CeedOperator` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorGetNumViewTabs(CeedOperator op, CeedInt *num_tabs) {
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)op, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief View a `CeedOperator`
+
+  @param[in] op     `CeedOperator` to view
+  @param[in] stream Stream to write; typically `stdout` or a file
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorView(CeedOperator op, FILE *stream) {
+  CeedCall(CeedOperatorView_Core(op, stream, true));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief View a brief summary `CeedOperator`
+
+  @param[in] op     `CeedOperator` to view brief summary
+  @param[in] stream Stream to write; typically `stdout` or a file
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorViewTerse(CeedOperator op, FILE *stream) {
+  CeedCall(CeedOperatorView_Core(op, stream, false));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1513,7 +1764,7 @@ int CeedOperatorView(CeedOperator op, FILE *stream) {
   @ref Advanced
 **/
 int CeedOperatorGetCeed(CeedOperator op, Ceed *ceed) {
-  *ceed = CeedOperatorReturnCeed(op);
+  CeedCall(CeedObjectGetCeed((CeedObject)op, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1526,7 +1777,7 @@ int CeedOperatorGetCeed(CeedOperator op, Ceed *ceed) {
 
   @ref Advanced
 **/
-Ceed CeedOperatorReturnCeed(CeedOperator op) { return op->ceed; }
+Ceed CeedOperatorReturnCeed(CeedOperator op) { return CeedObjectReturnCeed((CeedObject)op); }
 
 /**
   @brief Get the number of elements associated with a `CeedOperator`
@@ -1584,9 +1835,9 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
   if (is_composite) {
     CeedInt num_suboperators;
 
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
     CeedOperator *sub_operators;
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
 
     // FLOPs for each suboperator
     for (CeedInt i = 0; i < num_suboperators; i++) {
@@ -1596,15 +1847,40 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
       *flops += suboperator_flops;
     }
   } else {
-    CeedInt             num_input_fields, num_output_fields, num_elem = 0;
+    bool                is_at_points;
+    CeedInt             num_input_fields, num_output_fields, num_elem = 0, num_points = 0;
     CeedQFunction       qf;
     CeedQFunctionField *qf_input_fields, *qf_output_fields;
     CeedOperatorField  *op_input_fields, *op_output_fields;
 
+    CeedCall(CeedOperatorGetNumElements(op, &num_elem));
+    if (num_elem == 0) return CEED_ERROR_SUCCESS;
+    CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+    if (is_at_points) {
+      CeedMemType         mem_type;
+      CeedElemRestriction rstr_points = NULL;
+
+      CeedCall(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+      CeedCall(CeedGetPreferredMemType(CeedOperatorReturnCeed(op), &mem_type));
+      if (mem_type == CEED_MEM_DEVICE) {
+        // Device backends pad out to the same number of points per element
+        CeedCall(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &num_points));
+      } else {
+        num_points = 0;
+        for (CeedInt i = 0; i < num_elem; i++) {
+          CeedInt points_in_elem = 0;
+
+          CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr_points, i, &points_in_elem));
+          num_points += points_in_elem;
+        }
+        num_points = num_points / num_elem + (num_points % num_elem > 0);
+      }
+      CeedCall(CeedElemRestrictionDestroy(&rstr_points));
+    }
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_input_fields, &num_output_fields, &qf_output_fields));
+    CeedCall(CeedQFunctionDestroy(&qf));
     CeedCall(CeedOperatorGetFields(op, NULL, &op_input_fields, NULL, &op_output_fields));
-    CeedCall(CeedOperatorGetNumElements(op, &num_elem));
 
     // Input FLOPs
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -1619,12 +1895,15 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
 
         CeedCall(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr));
         CeedCall(CeedElemRestrictionGetFlopsEstimate(rstr, CEED_NOTRANSPOSE, &rstr_flops));
+        CeedCall(CeedElemRestrictionDestroy(&rstr));
         *flops += rstr_flops;
         CeedCall(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
         CeedCall(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_NOTRANSPOSE, eval_mode, &basis_flops));
+        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_NOTRANSPOSE, eval_mode, is_at_points, num_points, &basis_flops));
+        CeedCall(CeedBasisDestroy(&basis));
         *flops += basis_flops * num_elem;
       }
+      CeedCall(CeedVectorDestroy(&vec));
     }
     // QF FLOPs
     {
@@ -1632,9 +1911,11 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
       CeedSize      qf_flops;
       CeedQFunction qf;
 
-      CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts));
+      if (is_at_points) num_qpts = num_points;
+      else CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts));
       CeedCall(CeedOperatorGetQFunction(op, &qf));
       CeedCall(CeedQFunctionGetFlopsEstimate(qf, &qf_flops));
+      CeedCall(CeedQFunctionDestroy(&qf));
       CeedCheck(qf_flops > -1, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPLETE,
                 "Must set CeedQFunction FLOPs estimate with CeedQFunctionSetUserFlopsEstimate");
       *flops += num_elem * num_qpts * qf_flops;
@@ -1653,12 +1934,15 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
 
         CeedCall(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr));
         CeedCall(CeedElemRestrictionGetFlopsEstimate(rstr, CEED_TRANSPOSE, &rstr_flops));
+        CeedCall(CeedElemRestrictionDestroy(&rstr));
         *flops += rstr_flops;
         CeedCall(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
         CeedCall(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_TRANSPOSE, eval_mode, &basis_flops));
+        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_TRANSPOSE, eval_mode, is_at_points, num_points, &basis_flops));
+        CeedCall(CeedBasisDestroy(&basis));
         *flops += basis_flops * num_elem;
       }
+      CeedCall(CeedVectorDestroy(&vec));
     }
   }
   return CEED_ERROR_SUCCESS;
@@ -1688,6 +1972,8 @@ int CeedOperatorGetContext(CeedOperator op, CeedQFunctionContext *ctx) {
   CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE, "Cannot retrieve CeedQFunctionContext for composite operator");
   CeedCall(CeedOperatorGetQFunction(op, &qf));
   CeedCall(CeedQFunctionGetInnerContext(qf, &qf_ctx));
+  CeedCall(CeedQFunctionDestroy(&qf));
+  *ctx = NULL;
   if (qf_ctx) CeedCall(CeedQFunctionContextReferenceCopy(qf_ctx, ctx));
   return CEED_ERROR_SUCCESS;
 }
@@ -1726,8 +2012,8 @@ int CeedOperatorGetContextFieldLabel(CeedOperator op, const char *field_name, Ce
     CeedContextFieldLabel new_field_label;
 
     CeedCall(CeedCalloc(1, &new_field_label));
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     CeedCall(CeedCalloc(num_sub, &new_field_label->sub_labels));
     new_field_label->num_sub_labels = num_sub;
 
@@ -1780,6 +2066,7 @@ int CeedOperatorGetContextFieldLabel(CeedOperator op, const char *field_name, Ce
     // Single, non-composite operator
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetInnerContext(qf, &ctx));
+    CeedCall(CeedQFunctionDestroy(&qf));
     if (ctx) {
       CeedCall(CeedQFunctionContextGetFieldLabel(ctx, field_name, field_label));
     } else {
@@ -1961,7 +2248,7 @@ int CeedOperatorRestoreContextBooleanRead(CeedOperator op, CeedContextFieldLabel
   This computes the action of the operator on the specified (active) input, yielding its (active) output.
   All inputs and outputs must be specified using @ref CeedOperatorSetField().
 
-  Note: Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable.
+  @note Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable.
 
   @param[in]  op      `CeedOperator` to apply
   @param[in]  in      `CeedVector` containing input state or @ref CEED_VECTOR_NONE if there are no active inputs
@@ -1978,58 +2265,19 @@ int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedReques
   CeedCall(CeedOperatorCheckReady(op));
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
-  if (is_composite) {
+  if (is_composite && op->ApplyComposite) {
     // Composite Operator
-    if (op->ApplyComposite) {
-      CeedCall(op->ApplyComposite(op, in, out, request));
-    } else {
-      CeedInt       num_suboperators;
-      CeedOperator *sub_operators;
-
-      CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-      CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-
-      // Zero all output vectors
-      if (out != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(out, 0.0));
-      for (CeedInt i = 0; i < num_suboperators; i++) {
-        CeedInt            num_output_fields;
-        CeedOperatorField *output_fields;
-
-        CeedCall(CeedOperatorGetFields(sub_operators[i], NULL, NULL, &num_output_fields, &output_fields));
-        for (CeedInt j = 0; j < num_output_fields; j++) {
-          CeedVector vec;
-
-          CeedCall(CeedOperatorFieldGetVector(output_fields[j], &vec));
-          if (vec != CEED_VECTOR_ACTIVE && vec != CEED_VECTOR_NONE) {
-            CeedCall(CeedVectorSetValue(vec, 0.0));
-          }
-        }
-      }
-      // Apply
-      for (CeedInt i = 0; i < num_suboperators; i++) {
-        CeedCall(CeedOperatorApplyAdd(sub_operators[i], in, out, request));
-      }
-    }
-  } else {
+    CeedCall(op->ApplyComposite(op, in, out, request));
+  } else if (!is_composite && op->Apply) {
     // Standard Operator
-    if (op->Apply) {
-      CeedCall(op->Apply(op, in, out, request));
-    } else {
-      CeedInt            num_output_fields;
-      CeedOperatorField *output_fields;
-
-      CeedCall(CeedOperatorGetFields(op, NULL, NULL, &num_output_fields, &output_fields));
-      // Zero all output vectors
-      for (CeedInt i = 0; i < num_output_fields; i++) {
-        CeedVector vec;
+    CeedCall(op->Apply(op, in, out, request));
+  } else {
+    // Standard or composite, default to zeroing out and calling ApplyAddActive
+    // Zero active output
+    if (out != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(out, 0.0));
 
-        CeedCall(CeedOperatorFieldGetVector(output_fields[i], &vec));
-        if (vec == CEED_VECTOR_ACTIVE) vec = out;
-        if (vec != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(vec, 0.0));
-      }
-      // Apply
-      if (op->num_elem > 0) CeedCall(op->ApplyAdd(op, in, out, request));
-    }
+    // ApplyAddActive
+    CeedCall(CeedOperatorApplyAddActive(op, in, out, request));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -2040,6 +2288,10 @@ int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedReques
   This computes the action of the operator on the specified (active) input, yielding its (active) output.
   All inputs and outputs must be specified using @ref CeedOperatorSetField().
 
+  @note Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable.
+  @warning This function adds into ALL outputs, including passive outputs. To only add into the active output, use `CeedOperatorApplyAddActive()`.
+  @see `CeedOperatorApplyAddActive()`
+
   @param[in]  op      `CeedOperator` to apply
   @param[in]  in      `CeedVector` containing input state or @ref CEED_VECTOR_NONE if there are no active inputs
   @param[out] out     `CeedVector` to sum in result of applying operator (must be distinct from `in`) or @ref CEED_VECTOR_NONE if there are no active outputs
@@ -2063,8 +2315,8 @@ int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedReq
       CeedInt       num_suboperators;
       CeedOperator *sub_operators;
 
-      CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-      CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+      CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+      CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
       for (CeedInt i = 0; i < num_suboperators; i++) {
         CeedCall(CeedOperatorApplyAdd(sub_operators[i], in, out, request));
       }
@@ -2076,6 +2328,102 @@ int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedReq
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Apply `CeedOperator` to a `CeedVector` and add result to output `CeedVector`. Only sums into active outputs, overwrites passive outputs.
+
+  This computes the action of the operator on the specified (active) input, yielding its (active) output.
+  All inputs and outputs must be specified using @ref CeedOperatorSetField().
+
+  @note Calling this function asserts that setup is complete and sets the `CeedOperator` as immutable.
+
+  @param[in]  op      `CeedOperator` to apply
+  @param[in]  in      `CeedVector` containing input state or @ref CEED_VECTOR_NONE if there are no active inputs
+  @param[out] out     `CeedVector` to sum in result of applying operator (must be distinct from `in`) or @ref CEED_VECTOR_NONE if there are no active outputs
+  @param[in]  request Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorApplyAddActive(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request) {
+  bool is_composite;
+
+  CeedCall(CeedOperatorCheckReady(op));
+
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  if (is_composite) {
+    // Composite Operator
+    CeedInt       num_suboperators;
+    CeedOperator *sub_operators;
+
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+
+    // Zero all output vectors
+    for (CeedInt i = 0; i < num_suboperators; i++) {
+      CeedInt            num_output_fields;
+      CeedOperatorField *output_fields;
+
+      CeedCall(CeedOperatorGetFields(sub_operators[i], NULL, NULL, &num_output_fields, &output_fields));
+      for (CeedInt j = 0; j < num_output_fields; j++) {
+        CeedVector vec;
+
+        CeedCall(CeedOperatorFieldGetVector(output_fields[j], &vec));
+        if (vec != CEED_VECTOR_ACTIVE && vec != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(vec, 0.0));
+        CeedCall(CeedVectorDestroy(&vec));
+      }
+    }
+    // ApplyAdd
+    CeedCall(CeedOperatorApplyAdd(op, in, out, request));
+  } else {
+    // Standard Operator
+    CeedInt            num_output_fields;
+    CeedOperatorField *output_fields;
+
+    CeedCall(CeedOperatorGetFields(op, NULL, NULL, &num_output_fields, &output_fields));
+    // Zero all output vectors
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedVector vec;
+
+      CeedCall(CeedOperatorFieldGetVector(output_fields[i], &vec));
+      if (vec != CEED_VECTOR_ACTIVE && vec != CEED_VECTOR_NONE) CeedCall(CeedVectorSetValue(vec, 0.0));
+      CeedCall(CeedVectorDestroy(&vec));
+    }
+    // ApplyAdd
+    CeedCall(CeedOperatorApplyAdd(op, in, out, request));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Destroy temporary assembly data associated with a `CeedOperator`
+
+  @param[in,out] op `CeedOperator` whose assembly data to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedOperatorAssemblyDataStrip(CeedOperator op) {
+  bool is_composite;
+
+  CeedCall(CeedQFunctionAssemblyDataDestroy(&op->qf_assembled));
+  CeedCall(CeedOperatorAssemblyDataDestroy(&op->op_assembled));
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  if (is_composite) {
+    CeedInt       num_suboperators;
+    CeedOperator *sub_operators;
+
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+    for (CeedInt i = 0; i < num_suboperators; i++) {
+      CeedCall(CeedQFunctionAssemblyDataDestroy(&sub_operators[i]->qf_assembled));
+      CeedCall(CeedOperatorAssemblyDataDestroy(&sub_operators[i]->op_assembled));
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Destroy a `CeedOperator`
 
@@ -2086,12 +2434,14 @@ int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedReq
   @ref User
 **/
 int CeedOperatorDestroy(CeedOperator *op) {
-  if (!*op || --(*op)->ref_count > 0) {
+  if (!*op || CeedObjectDereference((CeedObject)*op) > 0) {
     *op = NULL;
     return CEED_ERROR_SUCCESS;
   }
-  if ((*op)->Destroy) CeedCall((*op)->Destroy(*op));
-  CeedCall(CeedDestroy(&(*op)->ceed));
+  // Backend destroy
+  if ((*op)->Destroy) {
+    CeedCall((*op)->Destroy(*op));
+  }
   // Free fields
   for (CeedInt i = 0; i < (*op)->num_fields; i++) {
     if ((*op)->input_fields[i]) {
@@ -2121,16 +2471,21 @@ int CeedOperatorDestroy(CeedOperator *op) {
       CeedCall(CeedFree(&(*op)->output_fields[i]));
     }
   }
-  // AtPoints data
+  CeedCall(CeedFree(&(*op)->input_fields));
+  CeedCall(CeedFree(&(*op)->output_fields));
+  // Destroy AtPoints data
   CeedCall(CeedVectorDestroy(&(*op)->point_coords));
   CeedCall(CeedElemRestrictionDestroy(&(*op)->rstr_points));
   CeedCall(CeedElemRestrictionDestroy(&(*op)->first_points_rstr));
+  // Destroy assembly data (must happen before destroying sub_operators)
+  CeedCall(CeedOperatorAssemblyDataStrip(*op));
   // Destroy sub_operators
   for (CeedInt i = 0; i < (*op)->num_suboperators; i++) {
     if ((*op)->sub_operators[i]) {
       CeedCall(CeedOperatorDestroy(&(*op)->sub_operators[i]));
     }
   }
+  CeedCall(CeedFree(&(*op)->sub_operators));
   CeedCall(CeedQFunctionDestroy(&(*op)->qf));
   CeedCall(CeedQFunctionDestroy(&(*op)->dqf));
   CeedCall(CeedQFunctionDestroy(&(*op)->dqfT));
@@ -2146,14 +2501,8 @@ int CeedOperatorDestroy(CeedOperator *op) {
   // Destroy fallback
   CeedCall(CeedOperatorDestroy(&(*op)->op_fallback));
 
-  // Destroy assembly data
-  CeedCall(CeedQFunctionAssemblyDataDestroy(&(*op)->qf_assembled));
-  CeedCall(CeedOperatorAssemblyDataDestroy(&(*op)->op_assembled));
-
-  CeedCall(CeedFree(&(*op)->input_fields));
-  CeedCall(CeedFree(&(*op)->output_fields));
-  CeedCall(CeedFree(&(*op)->sub_operators));
   CeedCall(CeedFree(&(*op)->name));
+  CeedCall(CeedObjectDestroy_Private(&(*op)->obj));
   CeedCall(CeedFree(op));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 88255ced0a..dd83302f2d 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -37,22 +37,22 @@
 static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, CeedQFunction *qf_fallback) {
   char               *source_path_with_name = NULL;
   CeedInt             num_input_fields, num_output_fields;
-  Ceed                ceed;
   CeedQFunctionField *input_fields, *output_fields;
 
   // Check if NULL qf passed in
   if (!qf) return CEED_ERROR_SUCCESS;
 
-  CeedCall(CeedQFunctionGetCeed(qf, &ceed));
-  CeedDebug256(ceed, 1, "---------- CeedOperator Fallback ----------\n");
-  CeedDebug(ceed, "Creating fallback CeedQFunction\n");
+  CeedDebug(CeedQFunctionReturnCeed(qf), "Creating fallback CeedQFunction\n");
 
   if (qf->source_path) {
     size_t path_len = strlen(qf->source_path), name_len = strlen(qf->kernel_name);
+
     CeedCall(CeedCalloc(path_len + name_len + 2, &source_path_with_name));
     memcpy(source_path_with_name, qf->source_path, path_len);
     memcpy(&source_path_with_name[path_len], ":", 1);
     memcpy(&source_path_with_name[path_len + 1], qf->kernel_name, name_len);
+  } else if (qf->user_source) {
+    CeedCall(CeedStringAllocCopy(qf->user_source, &source_path_with_name));
   } else {
     CeedCall(CeedCalloc(1, &source_path_with_name));
   }
@@ -70,6 +70,7 @@ static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, Cee
 
     CeedCall(CeedQFunctionGetContext(qf, &ctx));
     CeedCall(CeedQFunctionSetContext(*qf_fallback, ctx));
+    CeedCall(CeedQFunctionContextDestroy(&ctx));
   }
   CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -112,10 +113,10 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
   // Fallback Ceed
   CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
+  CeedCall(CeedDestroy(&ceed));
   if (!ceed_fallback) return CEED_ERROR_SUCCESS;
 
-  CeedDebug256(ceed, 1, "---------- CeedOperator Fallback ----------\n");
-  CeedDebug(ceed, "Creating fallback CeedOperator\n");
+  CeedDebug(CeedOperatorReturnCeed(op), "Creating fallback CeedOperator\n");
 
   // Clone Op
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
@@ -123,16 +124,17 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
     CeedInt       num_suboperators;
     CeedOperator *sub_operators;
 
-    CeedCall(CeedCompositeOperatorCreate(ceed_fallback, &op_fallback));
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCreateComposite(ceed_fallback, &op_fallback));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     for (CeedInt i = 0; i < num_suboperators; i++) {
       CeedOperator op_sub_fallback;
 
       CeedCall(CeedOperatorGetFallback(sub_operators[i], &op_sub_fallback));
-      CeedCall(CeedCompositeOperatorAddSub(op_fallback, op_sub_fallback));
+      CeedCall(CeedOperatorCompositeAddSub(op_fallback, op_sub_fallback));
     }
   } else {
+    bool               is_at_points = false;
     CeedInt            num_input_fields, num_output_fields;
     CeedQFunction      qf_fallback = NULL, dqf_fallback = NULL, dqfT_fallback = NULL;
     CeedOperatorField *input_fields, *output_fields;
@@ -140,7 +142,19 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
     CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->qf, &qf_fallback));
     CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->dqf, &dqf_fallback));
     CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->dqfT, &dqfT_fallback));
-    CeedCall(CeedOperatorCreate(ceed_fallback, qf_fallback, dqf_fallback, dqfT_fallback, &op_fallback));
+    CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+    if (is_at_points) {
+      CeedVector          points;
+      CeedElemRestriction rstr_points;
+
+      CeedCall(CeedOperatorCreateAtPoints(ceed_fallback, qf_fallback, dqf_fallback, dqfT_fallback, &op_fallback));
+      CeedCall(CeedOperatorAtPointsGetPoints(op, &rstr_points, &points));
+      CeedCall(CeedOperatorAtPointsSetPoints(op_fallback, rstr_points, points));
+      CeedCall(CeedVectorDestroy(&points));
+      CeedCall(CeedElemRestrictionDestroy(&rstr_points));
+    } else {
+      CeedCall(CeedOperatorCreate(ceed_fallback, qf_fallback, dqf_fallback, dqfT_fallback, &op_fallback));
+    }
     CeedCall(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
     for (CeedInt i = 0; i < num_input_fields; i++) {
       const char         *field_name;
@@ -150,6 +164,9 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
 
       CeedCall(CeedOperatorFieldGetData(input_fields[i], &field_name, &rstr, &basis, &vec));
       CeedCall(CeedOperatorSetField(op_fallback, field_name, rstr, basis, vec));
+      CeedCall(CeedVectorDestroy(&vec));
+      CeedCall(CeedElemRestrictionDestroy(&rstr));
+      CeedCall(CeedBasisDestroy(&basis));
     }
     for (CeedInt i = 0; i < num_output_fields; i++) {
       const char         *field_name;
@@ -159,8 +176,16 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
 
       CeedCall(CeedOperatorFieldGetData(output_fields[i], &field_name, &rstr, &basis, &vec));
       CeedCall(CeedOperatorSetField(op_fallback, field_name, rstr, basis, vec));
+      CeedCall(CeedVectorDestroy(&vec));
+      CeedCall(CeedElemRestrictionDestroy(&rstr));
+      CeedCall(CeedBasisDestroy(&basis));
+    }
+    {
+      CeedQFunctionAssemblyData data;
+
+      CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data));
+      CeedCall(CeedQFunctionAssemblyDataReferenceCopy(data, &op_fallback->qf_assembled));
     }
-    CeedCall(CeedQFunctionAssemblyDataReferenceCopy(op->qf_assembled, &op_fallback->qf_assembled));
     // Cleanup
     CeedCall(CeedQFunctionDestroy(&qf_fallback));
     CeedCall(CeedQFunctionDestroy(&dqf_fallback));
@@ -172,6 +197,7 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
   //       The op holds the only reference to op_fallback and is responsible for deleting itself and op_fallback.
   op->op_fallback                 = op_fallback;
   op_fallback->op_fallback_parent = op;
+  CeedCall(CeedDestroy(&ceed_fallback));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -187,14 +213,12 @@ static int CeedOperatorCreateFallback(CeedOperator op) {
 
   @ref Developer
 **/
-static inline int CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(CeedOperator op, CeedRequest *request, const bool is_point_block,
+static inline int CeedOperatorLinearAssembleAddDiagonalSingle_Mesh(CeedOperator op, CeedRequest *request, const bool is_point_block,
                                                                    CeedVector assembled) {
-  Ceed ceed;
   bool is_composite;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
-  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
+  CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
 
   // Assemble QFunction
   CeedInt             layout_qf[3];
@@ -251,7 +275,7 @@ static inline int CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(CeedOperator
         continue;
       }  // No matching output basis found
     }
-    CeedCheck(active_elem_rstrs_in[b_in] == active_elem_rstrs_out[b_out], ceed, CEED_ERROR_UNSUPPORTED,
+    CeedCheck(active_elem_rstrs_in[b_in] == active_elem_rstrs_out[b_out], CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
               "Cannot assemble operator diagonal with different input and output active element restrictions");
 
     // Assemble point block diagonal restriction, if needed
@@ -374,15 +398,13 @@ static inline int CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(CeedOperator
 
   @ref Developer
 **/
-static inline int CeedSingleOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedRequest *request, const bool is_point_block,
+static inline int CeedOperatorLinearAssembleAddDiagonalSingle(CeedOperator op, CeedRequest *request, const bool is_point_block,
                                                               CeedVector assembled) {
-  Ceed ceed;
   bool is_at_points;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
-  CeedCheck(!is_at_points, ceed, CEED_ERROR_UNSUPPORTED, "AtPoints operator not supported");
-  CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal_Mesh(op, request, is_point_block, assembled));
+  CeedCheck(!is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "AtPoints operator not supported");
+  CeedCall(CeedOperatorLinearAssembleAddDiagonalSingle_Mesh(op, request, is_point_block, assembled));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -398,13 +420,13 @@ static inline int CeedSingleOperatorLinearAssembleAddDiagonal(CeedOperator op, C
 
   @ref Developer
 **/
-static inline int CeedCompositeOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedRequest *request, const bool is_point_block,
+static inline int CeedOperatorLinearAssembleAddDiagonalComposite(CeedOperator op, CeedRequest *request, const bool is_point_block,
                                                                  CeedVector assembled) {
   CeedInt       num_sub;
   CeedOperator *suboperators;
 
-  CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub));
-  CeedCall(CeedCompositeOperatorGetSubList(op, &suboperators));
+  CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub));
+  CeedCall(CeedOperatorCompositeGetSubList(op, &suboperators));
   for (CeedInt i = 0; i < num_sub; i++) {
     if (is_point_block) {
       CeedCall(CeedOperatorLinearAssembleAddPointBlockDiagonal(suboperators[i], assembled, request));
@@ -429,7 +451,7 @@ static inline int CeedCompositeOperatorLinearAssembleAddDiagonal(CeedOperator op
 
   @ref Developer
 **/
-static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, CeedInt *rows, CeedInt *cols) {
+static int CeedOperatorAssembleSymbolicSingle(CeedOperator op, CeedInt offset, CeedInt *rows, CeedInt *cols) {
   Ceed                ceed;
   bool                is_composite;
   CeedSize            num_nodes_in, num_nodes_out, local_num_entries, count = 0;
@@ -440,8 +462,8 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   CeedVector          index_vec_in, index_vec_out, elem_dof_in, elem_dof_out;
   CeedElemRestriction elem_rstr_in, elem_rstr_out, index_elem_rstr_in, index_elem_rstr_out;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
+  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &num_nodes_in, &num_nodes_out));
@@ -454,7 +476,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   // Determine elem_dof relation for input
   CeedCall(CeedVectorCreate(ceed, num_nodes_in, &index_vec_in));
   CeedCall(CeedVectorGetArrayWrite(index_vec_in, CEED_MEM_HOST, &array));
-  for (CeedInt i = 0; i < num_nodes_in; i++) array[i] = i;
+  for (CeedSize i = 0; i < num_nodes_in; i++) array[i] = i;
   CeedCall(CeedVectorRestoreArray(index_vec_in, &array));
   CeedCall(CeedVectorCreate(ceed, num_elem_in * elem_size_in * num_comp_in, &elem_dof_in));
   CeedCall(CeedVectorSetValue(elem_dof_in, 0.0));
@@ -467,7 +489,9 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   if (elem_rstr_in != elem_rstr_out) {
     CeedCall(CeedElemRestrictionGetNumElements(elem_rstr_out, &num_elem_out));
     CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED,
-              "Active input and output operator restrictions must have the same number of elements");
+              "Active input and output operator restrictions must have the same number of elements."
+              " Input has %" CeedInt_FMT " elements; output has %" CeedInt_FMT "elements.",
+              num_elem_in, num_elem_out);
     CeedCall(CeedElemRestrictionGetElementSize(elem_rstr_out, &elem_size_out));
     CeedCall(CeedElemRestrictionGetNumComponents(elem_rstr_out, &num_comp_out));
     CeedCall(CeedElemRestrictionGetELayout(elem_rstr_out, layout_er_out));
@@ -475,7 +499,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
     // Determine elem_dof relation for output
     CeedCall(CeedVectorCreate(ceed, num_nodes_out, &index_vec_out));
     CeedCall(CeedVectorGetArrayWrite(index_vec_out, CEED_MEM_HOST, &array));
-    for (CeedInt i = 0; i < num_nodes_out; i++) array[i] = i;
+    for (CeedSize i = 0; i < num_nodes_out; i++) array[i] = i;
     CeedCall(CeedVectorRestoreArray(index_vec_out, &array));
     CeedCall(CeedVectorCreate(ceed, num_elem_out * elem_size_out * num_comp_out, &elem_dof_out));
     CeedCall(CeedVectorSetValue(elem_dof_out, 0.0));
@@ -493,7 +517,7 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
     layout_er_out[2] = layout_er_in[2];
     elem_dof_a_out   = elem_dof_a_in;
   }
-  local_num_entries = elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in;
+  local_num_entries = (CeedSize)elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in;
 
   // Determine i, j locations for element matrices
   for (CeedInt e = 0; e < num_elem_in; e++) {
@@ -521,9 +545,116 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
     CeedCall(CeedVectorRestoreArrayRead(elem_dof_out, &elem_dof_a_out));
     CeedCall(CeedVectorDestroy(&elem_dof_out));
   }
+  CeedCall(CeedElemRestrictionDestroy(&elem_rstr_in));
+  CeedCall(CeedElemRestrictionDestroy(&elem_rstr_out));
+  CeedCall(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Core logic to assemble `CeedQFunction` and store result internally.
+
+  Return copied references of stored data to the caller.
+  Caller is responsible for ownership and destruction of the copied references.
+  See also @ref CeedOperatorLinearAssembleQFunction().
+
+  Note: If the value of `assembled` or `rstr` passed to this function are non-`NULL` , then it is assumed that they hold valid pointers.
+        These objects will be destroyed if `*assembled` or `*rstr` is the only reference to the object.
+
+  @param[in]  op         `CeedOperator` to assemble `CeedQFunction`
+  @param[in]  use_parent Boolean flag to check for fallback parent implementation
+  @param[out] assembled  `CeedVector` to store assembled `CeedQFunction` at quadrature points
+  @param[out] rstr       `CeedElemRestriction` for `CeedVector` containing assembled `CeedQFunction`
+  @param[in]  request    Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+static int CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(CeedOperator op, bool use_parent, CeedVector *assembled, CeedElemRestriction *rstr,
+                                                                 CeedRequest *request) {
+  int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, CeedElemRestriction, CeedRequest *) = NULL;
+  CeedOperator op_assemble                                                                           = NULL;
+  CeedOperator op_fallback_parent                                                                    = NULL;
+
+  CeedCall(CeedOperatorCheckReady(op));
+
+  // Determine if fallback parent or operator has implementation
+  CeedCall(CeedOperatorGetFallbackParent(op, &op_fallback_parent));
+  if (op_fallback_parent && use_parent && op_fallback_parent->LinearAssembleQFunctionUpdate) {
+    // -- Backend version for op fallback parent is faster, if it exists
+    CeedDebug(CeedOperatorReturnCeed(op), "Using fallback parent for CeedOperatorLinearAssembleQFunctionBuildOrUpdate\n");
+    LinearAssembleQFunctionUpdate = op_fallback_parent->LinearAssembleQFunctionUpdate;
+    op_assemble                   = op_fallback_parent;
+  } else if (op->LinearAssembleQFunctionUpdate) {
+    // -- Backend version for op
+    LinearAssembleQFunctionUpdate = op->LinearAssembleQFunctionUpdate;
+    op_assemble                   = op;
+  }
+
+  // Assemble QFunction
+  if (LinearAssembleQFunctionUpdate) {
+    // Backend or fallback parent version
+    CeedQFunctionAssemblyData data;
+    bool                      data_is_setup;
+    CeedVector                assembled_vec  = NULL;
+    CeedElemRestriction       assembled_rstr = NULL;
+
+    CeedCall(CeedOperatorGetQFunctionAssemblyData(op, &data));
+    CeedCall(CeedQFunctionAssemblyDataIsSetup(data, &data_is_setup));
+    if (data_is_setup) {
+      bool update_needed;
+
+      CeedCall(CeedQFunctionAssemblyDataGetObjects(data, &assembled_vec, &assembled_rstr));
+      CeedCall(CeedQFunctionAssemblyDataIsUpdateNeeded(data, &update_needed));
+      if (update_needed) CeedCall(LinearAssembleQFunctionUpdate(op_assemble, assembled_vec, assembled_rstr, request));
+    } else {
+      CeedCall(CeedOperatorLinearAssembleQFunction(op_assemble, &assembled_vec, &assembled_rstr, request));
+      CeedCall(CeedQFunctionAssemblyDataSetObjects(data, assembled_vec, assembled_rstr));
+    }
+    CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(data, false));
+
+    // Copy reference from internally held copy
+    CeedCall(CeedVectorReferenceCopy(assembled_vec, assembled));
+    CeedCall(CeedElemRestrictionReferenceCopy(assembled_rstr, rstr));
+    CeedCall(CeedVectorDestroy(&assembled_vec));
+    CeedCall(CeedElemRestrictionDestroy(&assembled_rstr));
+  } else {
+    // Operator fallback
+    CeedOperator op_fallback;
+
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleQFunctionBuildOrUpdate\n");
+    CeedCall(CeedOperatorGetFallback(op, &op_fallback));
+    if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
+    else return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunctionUpdate");
+  }
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Assemble `CeedQFunction` and store result internally, but do not use fallback parent.
+
+  Return copied references of stored data to the caller.
+  Caller is responsible for ownership and destruction of the copied references.
+  See also @ref CeedOperatorLinearAssembleQFunction().
+
+  Note: If the value of `assembled` or `rstr` passed to this function are non-`NULL` , then it is assumed that they hold valid pointers.
+        These objects will be destroyed if `*assembled` or `*rstr` is the only reference to the object.
+
+  @param[in]  op        `CeedOperator` to assemble `CeedQFunction`
+  @param[out] assembled `CeedVector` to store assembled `CeedQFunction` at quadrature points
+  @param[out] rstr      `CeedElemRestriction` for `CeedVector` containing assembled `CeedQFunction`
+  @param[in]  request   Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
+                                                             CeedRequest *request) {
+  return CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(op, false, assembled, rstr, request);
+}
+
 /**
   @brief Assemble nonzero entries for non-composite `CeedOperator`.
 
@@ -537,13 +668,11 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
 
   @ref Developer
 **/
-static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector values) {
-  Ceed ceed;
-  bool is_composite;
+int CeedOperatorAssembleSingle(CeedOperator op, CeedInt offset, CeedVector values) {
+  bool is_composite, is_at_points;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
-  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
+  CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
 
   // Early exit for empty operator
   {
@@ -561,13 +690,18 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorAssembleSingle\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
-      CeedCall(CeedSingleOperatorAssemble(op_fallback, offset, values));
+      CeedCall(CeedOperatorAssembleSingle(op_fallback, offset, values));
       return CEED_ERROR_SUCCESS;
     }
   }
 
+  CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+  CeedCheck(!is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+            "Backend does not implement CeedOperatorLinearAssemble for AtPoints operator");
+
   // Assemble QFunction
   CeedInt             layout_qf[3];
   const CeedScalar   *assembled_qf_array;
@@ -597,9 +731,10 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
   CeedCall(CeedOperatorAssemblyDataGetEvalModes(data, &num_active_bases_in, &num_eval_modes_in, &eval_modes_in, NULL, &num_active_bases_out,
                                                 &num_eval_modes_out, &eval_modes_out, NULL, NULL));
 
-  CeedCheck(num_active_bases_in == num_active_bases_out && num_active_bases_in == 1, ceed, CEED_ERROR_UNSUPPORTED,
+  CeedCheck(num_active_bases_in == 1 && num_active_bases_out == 1, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
             "Cannot assemble operator with multiple active bases");
-  CeedCheck(num_eval_modes_in[0] > 0 && num_eval_modes_out[0] > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs");
+  CeedCheck(num_eval_modes_in[0] > 0 && num_eval_modes_out[0] > 0, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+            "Cannot assemble operator without inputs/outputs");
 
   CeedCall(CeedOperatorAssemblyDataGetBases(data, NULL, &active_bases_in, &B_mats_in, NULL, &active_bases_out, &B_mats_out));
   CeedCall(CeedOperatorGetActiveElemRestrictions(op, &elem_rstr_in, &elem_rstr_out));
@@ -623,14 +758,18 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
 
   if (elem_rstr_in != elem_rstr_out) {
     CeedCall(CeedElemRestrictionGetNumElements(elem_rstr_out, &num_elem_out));
-    CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED,
-              "Active input and output operator restrictions must have the same number of elements");
+    CeedCheck(num_elem_in == num_elem_out, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+              "Active input and output operator restrictions must have the same number of elements."
+              " Input has %" CeedInt_FMT " elements; output has %" CeedInt_FMT "elements.",
+              num_elem_in, num_elem_out);
     CeedCall(CeedElemRestrictionGetElementSize(elem_rstr_out, &elem_size_out));
     CeedCall(CeedElemRestrictionGetNumComponents(elem_rstr_out, &num_comp_out));
     if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out;
     else CeedCall(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out));
-    CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED,
-              "Active input and output bases must have the same number of quadrature points");
+    CeedCheck(num_qpts_in == num_qpts_out, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+              "Active input and output bases must have the same number of quadrature points."
+              " Input has %" CeedInt_FMT " points; output has %" CeedInt_FMT "points.",
+              num_qpts_in, num_qpts_out);
 
     CeedCall(CeedElemRestrictionGetType(elem_rstr_out, &elem_rstr_type_out));
     if (elem_rstr_type_out == CEED_RESTRICTION_ORIENTED) {
@@ -647,7 +786,7 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
     elem_rstr_orients_out      = elem_rstr_orients_in;
     elem_rstr_curl_orients_out = elem_rstr_curl_orients_in;
   }
-  local_num_entries = elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in;
+  local_num_entries = (CeedSize)elem_size_out * num_comp_out * elem_size_in * num_comp_in * num_elem_in;
 
   // Loop over elements and put in data structure
   // We store B_mat_in, B_mat_out, BTD, elem_mat in row-major order
@@ -687,7 +826,11 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
           CeedCall(CeedTensorContractApply(contract, 1, num_qpts_in * num_eval_modes_in[0], elem_size_in, elem_size_out, BTD_mat, CEED_NOTRANSPOSE,
                                            false, B_mat_in, elem_mat));
         } else {
+          Ceed ceed;
+
+          CeedCall(CeedOperatorGetCeed(op, &ceed));
           CeedCall(CeedMatrixMatrixMultiply(ceed, BTD_mat, B_mat_in, elem_mat, elem_size_out, elem_size_in, num_qpts_in * num_eval_modes_in[0]));
+          CeedCall(CeedDestroy(&ceed));
         }
 
         // Transform the element matrix if required
@@ -746,7 +889,7 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
       }
     }
   }
-  CeedCheck(count == local_num_entries, ceed, CEED_ERROR_MAJOR, "Error computing entries");
+  CeedCheck(count == local_num_entries, CeedOperatorReturnCeed(op), CEED_ERROR_MAJOR, "Error computing entries");
   CeedCall(CeedVectorRestoreArray(values, &vals));
 
   // Cleanup
@@ -767,6 +910,8 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
   }
   CeedCall(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array));
   CeedCall(CeedVectorDestroy(&assembled_qf));
+  CeedCall(CeedElemRestrictionDestroy(&elem_rstr_in));
+  CeedCall(CeedElemRestrictionDestroy(&elem_rstr_out));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -780,15 +925,13 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVecto
 
   @ref Utility
 **/
-static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num_entries) {
+static int CeedOperatorAssemblyCountEntriesSingle(CeedOperator op, CeedSize *num_entries) {
   bool                is_composite;
   CeedInt             num_elem_in, elem_size_in, num_comp_in, num_elem_out, elem_size_out, num_comp_out;
-  Ceed                ceed;
   CeedElemRestriction rstr_in, rstr_out;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
-  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
+  CeedCheck(!is_composite, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
 
   CeedCall(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out));
   CeedCall(CeedElemRestrictionGetNumElements(rstr_in, &num_elem_in));
@@ -796,8 +939,10 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num
   CeedCall(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in));
   if (rstr_in != rstr_out) {
     CeedCall(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out));
-    CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED,
-              "Active input and output operator restrictions must have the same number of elements");
+    CeedCheck(num_elem_in == num_elem_out, CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED,
+              "Active input and output operator restrictions must have the same number of elements."
+              " Input has %" CeedInt_FMT " elements; output has %" CeedInt_FMT "elements.",
+              num_elem_in, num_elem_out);
     CeedCall(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out));
     CeedCall(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out));
   } else {
@@ -805,10 +950,48 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num
     elem_size_out = elem_size_in;
     num_comp_out  = num_comp_in;
   }
+  CeedCall(CeedElemRestrictionDestroy(&rstr_in));
+  CeedCall(CeedElemRestrictionDestroy(&rstr_out));
   *num_entries = (CeedSize)elem_size_in * num_comp_in * elem_size_out * num_comp_out * num_elem_in;
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Count number of entries for assembled `CeedOperator`
+
+  @param[in]  op          `CeedOperator` to assemble
+  @param[out] num_entries Number of entries in assembled representation
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Utility
+**/
+int CeedOperatorLinearAssembleGetNumEntries(CeedOperator op, CeedSize *num_entries) {
+  bool is_composite;
+
+  CeedCall(CeedOperatorCheckReady(op));
+  CeedCall(CeedOperatorIsComposite(op, &is_composite));
+
+  if (is_composite) {
+    CeedInt       num_suboperators;
+    CeedOperator *sub_operators;
+
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+
+    *num_entries = 0;
+    for (CeedInt k = 0; k < num_suboperators; ++k) {
+      CeedSize single_entries;
+
+      CeedCall(CeedOperatorAssemblyCountEntriesSingle(sub_operators[k], &single_entries));
+      *num_entries += single_entries;
+    }
+  } else {
+    CeedCall(CeedOperatorAssemblyCountEntriesSingle(op, num_entries));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Common code for creating a multigrid coarse `CeedOperator` and level transfer `CeedOperator` for a `CeedOperator`
 
@@ -825,11 +1008,12 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num
 
   @ref Developer
 **/
-static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse,
-                                            CeedBasis basis_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_restrict) {
+static int CeedOperatorMultigridLevelCreateSingle_Core(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse,
+                                                       CeedBasis basis_coarse, CeedBasis basis_c_to_f, CeedOperator *op_coarse,
+                                                       CeedOperator *op_prolong, CeedOperator *op_restrict) {
   bool                is_composite;
   Ceed                ceed;
-  CeedInt             num_comp, num_input_fields, num_output_fields;
+  CeedInt             dim              = 0, num_comp, num_input_fields, num_output_fields;
   CeedVector          mult_vec         = NULL;
   CeedElemRestriction rstr_p_mult_fine = NULL, rstr_fine = NULL;
   CeedOperatorField  *input_fields, *output_fields;
@@ -841,66 +1025,144 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m
   CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Automatic multigrid setup for composite operators not supported");
 
   // Coarse Grid
-  CeedCall(CeedOperatorCreate(ceed, op_fine->qf, op_fine->dqf, op_fine->dqfT, op_coarse));
+  {
+    bool is_at_points;
+
+    CeedCall(CeedOperatorIsAtPoints(op_fine, &is_at_points));
+    if (is_at_points) {
+      CeedVector          point_coords;
+      CeedElemRestriction rstr_points;
+
+      CeedCall(CeedOperatorCreateAtPoints(ceed, op_fine->qf, op_fine->dqf, op_fine->dqfT, op_coarse));
+      CeedCall(CeedOperatorAtPointsGetPoints(op_fine, &rstr_points, &point_coords));
+      CeedCall(CeedOperatorAtPointsSetPoints(*op_coarse, rstr_points, point_coords));
+      CeedCall(CeedVectorDestroy(&point_coords));
+      CeedCall(CeedElemRestrictionDestroy(&rstr_points));
+    } else {
+      CeedCall(CeedOperatorCreate(ceed, op_fine->qf, op_fine->dqf, op_fine->dqfT, op_coarse));
+    }
+  }
   CeedCall(CeedOperatorGetFields(op_fine, &num_input_fields, &input_fields, &num_output_fields, &output_fields));
   // -- Clone input fields
   for (CeedInt i = 0; i < num_input_fields; i++) {
     const char         *field_name;
     CeedVector          vec;
-    CeedElemRestriction rstr;
-    CeedBasis           basis;
+    CeedElemRestriction rstr  = NULL;
+    CeedBasis           basis = NULL;
 
     CeedCall(CeedOperatorFieldGetName(input_fields[i], &field_name));
     CeedCall(CeedOperatorFieldGetVector(input_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      rstr  = rstr_coarse;
-      basis = basis_coarse;
-      CeedCall(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_fine));
+      CeedCall(CeedElemRestrictionReferenceCopy(rstr_coarse, &rstr));
+      CeedCall(CeedBasisReferenceCopy(basis_coarse, &basis));
+      if (!rstr_fine) CeedCall(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_fine));
     } else {
       CeedCall(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr));
       CeedCall(CeedOperatorFieldGetBasis(input_fields[i], &basis));
     }
+    if (dim == 0) CeedCall(CeedBasisGetDimension(basis, &dim));
     CeedCall(CeedOperatorSetField(*op_coarse, field_name, rstr, basis, vec));
+    CeedCall(CeedVectorDestroy(&vec));
+    CeedCall(CeedElemRestrictionDestroy(&rstr));
+    CeedCall(CeedBasisDestroy(&basis));
   }
   // -- Clone output fields
   for (CeedInt i = 0; i < num_output_fields; i++) {
     const char         *field_name;
     CeedVector          vec;
-    CeedElemRestriction rstr;
-    CeedBasis           basis;
+    CeedElemRestriction rstr  = NULL;
+    CeedBasis           basis = NULL;
 
     CeedCall(CeedOperatorFieldGetName(output_fields[i], &field_name));
     CeedCall(CeedOperatorFieldGetVector(output_fields[i], &vec));
     if (vec == CEED_VECTOR_ACTIVE) {
-      rstr  = rstr_coarse;
-      basis = basis_coarse;
-      CeedCall(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_fine));
+      CeedCall(CeedElemRestrictionReferenceCopy(rstr_coarse, &rstr));
+      CeedCall(CeedBasisReferenceCopy(basis_coarse, &basis));
+      if (!rstr_fine) CeedCall(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_fine));
     } else {
       CeedCall(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr));
       CeedCall(CeedOperatorFieldGetBasis(output_fields[i], &basis));
     }
+    if (dim == 0) CeedCall(CeedBasisGetDimension(basis, &dim));
     CeedCall(CeedOperatorSetField(*op_coarse, field_name, rstr, basis, vec));
+    CeedCall(CeedVectorDestroy(&vec));
+    CeedCall(CeedElemRestrictionDestroy(&rstr));
+    CeedCall(CeedBasisDestroy(&basis));
   }
+  dim = dim ? dim : 1;
   // -- Clone QFunctionAssemblyData
-  CeedCall(CeedQFunctionAssemblyDataReferenceCopy(op_fine->qf_assembled, &(*op_coarse)->qf_assembled));
+  {
+    CeedQFunctionAssemblyData fine_data;
+
+    CeedCall(CeedOperatorGetQFunctionAssemblyData(op_fine, &fine_data));
+    CeedCall(CeedQFunctionAssemblyDataReferenceCopy(fine_data, &(*op_coarse)->qf_assembled));
+  }
 
   // Multiplicity vector
+  bool use_scalar_mult = true;
+
   if (op_restrict || op_prolong) {
-    CeedVector          mult_e_vec;
+    CeedInt             num_elem, num_comp, elem_size;
+    CeedVector          mult_l_vec, mult_e_vec;
     CeedRestrictionType rstr_type;
+    CeedElemRestriction rstr_p_mult_full;
 
     CeedCall(CeedElemRestrictionGetType(rstr_fine, &rstr_type));
     CeedCheck(rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_UNSUPPORTED,
               "Element restrictions created with CeedElemRestrictionCreateCurlOriented are not supported");
     CeedCheck(p_mult_fine, ceed, CEED_ERROR_INCOMPATIBLE, "Prolongation or restriction operator creation requires fine grid multiplicity vector");
-    CeedCall(CeedElemRestrictionCreateUnsignedCopy(rstr_fine, &rstr_p_mult_fine));
-    CeedCall(CeedElemRestrictionCreateVector(rstr_fine, &mult_vec, &mult_e_vec));
+
+    // Create multiplicity multi-component l-vector
+    CeedCall(CeedElemRestrictionCreateUnsignedCopy(rstr_fine, &rstr_p_mult_full));
+    CeedCall(CeedElemRestrictionGetNumElements(rstr_p_mult_full, &num_elem));
+    CeedCall(CeedElemRestrictionGetNumComponents(rstr_p_mult_full, &num_comp));
+    CeedCall(CeedElemRestrictionGetElementSize(rstr_p_mult_full, &elem_size));
+    CeedCall(CeedElemRestrictionCreateVector(rstr_fine, &mult_l_vec, &mult_e_vec));
     CeedCall(CeedVectorSetValue(mult_e_vec, 0.0));
-    CeedCall(CeedElemRestrictionApply(rstr_p_mult_fine, CEED_NOTRANSPOSE, p_mult_fine, mult_e_vec, CEED_REQUEST_IMMEDIATE));
-    CeedCall(CeedVectorSetValue(mult_vec, 0.0));
-    CeedCall(CeedElemRestrictionApply(rstr_p_mult_fine, CEED_TRANSPOSE, mult_e_vec, mult_vec, CEED_REQUEST_IMMEDIATE));
+    CeedCall(CeedElemRestrictionApply(rstr_p_mult_full, CEED_NOTRANSPOSE, p_mult_fine, mult_e_vec, CEED_REQUEST_IMMEDIATE));
+    CeedCall(CeedVectorSetValue(mult_l_vec, 0.0));
+    CeedCall(CeedElemRestrictionApply(rstr_p_mult_full, CEED_TRANSPOSE, mult_e_vec, mult_l_vec, CEED_REQUEST_IMMEDIATE));
+    CeedCall(CeedVectorReciprocal(mult_l_vec));
+
+    // Determine to use scalar multiplicity or not
+    {
+      const CeedInt p = pow(elem_size, 1.0 / dim);
+
+      use_scalar_mult = num_comp > 1 && (dim < 3 || num_comp - 1 > (3 * (pow(p, dim - 1) - pow(p, dim - 2)) + 1) / pow(p - 1, dim));
+    }
+
+    if (use_scalar_mult) {
+      // Create multiplicity single component e-vector
+      CeedCall(CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, 1, num_elem * elem_size, CEED_STRIDES_BACKEND, &rstr_p_mult_fine));
+      CeedCall(CeedElemRestrictionCreateVector(rstr_p_mult_fine, &mult_vec, NULL));
+      {
+        CeedQFunction qf_to_scalar;
+        CeedOperator  op_to_scalar;
+
+        CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Identity to scalar", &qf_to_scalar));
+        CeedCall(CeedQFunctionAddInput(qf_to_scalar, "input", num_comp, CEED_EVAL_NONE));
+        CeedCall(CeedQFunctionAddOutput(qf_to_scalar, "output", 1, CEED_EVAL_NONE));
+
+        CeedCall(CeedOperatorCreate(ceed, qf_to_scalar, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_to_scalar));
+        CeedCall(CeedOperatorSetField(op_to_scalar, "input", rstr_p_mult_full, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+        CeedCall(CeedOperatorSetField(op_to_scalar, "output", rstr_p_mult_fine, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE));
+
+        CeedCall(CeedOperatorApply(op_to_scalar, mult_l_vec, mult_vec, CEED_REQUEST_IMMEDIATE));
+
+        // Clean-up
+        CeedCall(CeedQFunctionDestroy(&qf_to_scalar));
+        CeedCall(CeedOperatorDestroy(&op_to_scalar));
+      }
+    } else {
+      mult_vec = NULL;
+      CeedCall(CeedVectorReferenceCopy(mult_l_vec, &mult_vec));
+      rstr_p_mult_fine = NULL;
+      CeedCall(CeedElemRestrictionReferenceCopy(rstr_p_mult_full, &rstr_p_mult_fine));
+    }
+    // Clean-up
     CeedCall(CeedVectorDestroy(&mult_e_vec));
-    CeedCall(CeedVectorReciprocal(mult_vec));
+    CeedCall(CeedVectorDestroy(&mult_l_vec));
+    CeedCall(CeedElemRestrictionDestroy(&rstr_p_mult_full));
   }
 
   // Clone name
@@ -921,7 +1183,7 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m
     CeedQFunctionContext ctx_r;
     CeedQFunction        qf_restrict;
 
-    CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_restrict));
+    CeedCall(CeedQFunctionCreateInteriorByName(ceed, use_scalar_mult ? "Scale (scalar)" : "Scale", &qf_restrict));
     CeedCall(CeedCalloc(1, &num_comp_r_data));
     num_comp_r_data[0] = num_comp;
     CeedCall(CeedQFunctionContextCreate(ceed, &ctx_r));
@@ -929,7 +1191,7 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m
     CeedCall(CeedQFunctionSetContext(qf_restrict, ctx_r));
     CeedCall(CeedQFunctionContextDestroy(&ctx_r));
     CeedCall(CeedQFunctionAddInput(qf_restrict, "input", num_comp, CEED_EVAL_NONE));
-    CeedCall(CeedQFunctionAddInput(qf_restrict, "scale", num_comp, CEED_EVAL_NONE));
+    CeedCall(CeedQFunctionAddInput(qf_restrict, "scale", use_scalar_mult ? 1 : num_comp, CEED_EVAL_NONE));
     CeedCall(CeedQFunctionAddOutput(qf_restrict, "output", num_comp, CEED_EVAL_INTERP));
     CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_restrict, num_comp));
 
@@ -959,7 +1221,7 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m
     CeedQFunctionContext ctx_p;
     CeedQFunction        qf_prolong;
 
-    CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_prolong));
+    CeedCall(CeedQFunctionCreateInteriorByName(ceed, use_scalar_mult ? "Scale (scalar)" : "Scale", &qf_prolong));
     CeedCall(CeedCalloc(1, &num_comp_p_data));
     num_comp_p_data[0] = num_comp;
     CeedCall(CeedQFunctionContextCreate(ceed, &ctx_p));
@@ -967,7 +1229,7 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m
     CeedCall(CeedQFunctionSetContext(qf_prolong, ctx_p));
     CeedCall(CeedQFunctionContextDestroy(&ctx_p));
     CeedCall(CeedQFunctionAddInput(qf_prolong, "input", num_comp, CEED_EVAL_INTERP));
-    CeedCall(CeedQFunctionAddInput(qf_prolong, "scale", num_comp, CEED_EVAL_NONE));
+    CeedCall(CeedQFunctionAddInput(qf_prolong, "scale", use_scalar_mult ? 1 : num_comp, CEED_EVAL_NONE));
     CeedCall(CeedQFunctionAddOutput(qf_prolong, "output", num_comp, CEED_EVAL_NONE));
     CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_prolong, num_comp));
 
@@ -995,7 +1257,9 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m
   CeedCall(CeedOperatorCheckReady(*op_coarse));
 
   // Cleanup
+  CeedCall(CeedDestroy(&ceed));
   CeedCall(CeedVectorDestroy(&mult_vec));
+  CeedCall(CeedElemRestrictionDestroy(&rstr_fine));
   CeedCall(CeedElemRestrictionDestroy(&rstr_p_mult_fine));
   CeedCall(CeedBasisDestroy(&basis_c_to_f));
   return CEED_ERROR_SUCCESS;
@@ -1122,6 +1386,28 @@ int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, Ceed
 
   // Cleanup
   CeedCall(CeedElemRestrictionRestoreOffsets(rstr, &offsets));
+  CeedCall(CeedDestroy(&ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get `CeedQFunctionAssemblyData`
+
+  @param[in]  op   `CeedOperator` to assemble
+  @param[out] data `CeedQFunctionAssemblyData`
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedOperatorGetQFunctionAssemblyData(CeedOperator op, CeedQFunctionAssemblyData *data) {
+  if (!op->qf_assembled) {
+    CeedQFunctionAssemblyData data;
+
+    CeedCall(CeedQFunctionAssemblyDataCreate(CeedOperatorReturnCeed(op), &data));
+    op->qf_assembled = data;
+  }
+  *data = op->qf_assembled;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1138,8 +1424,7 @@ int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, Ceed
 int CeedQFunctionAssemblyDataCreate(Ceed ceed, CeedQFunctionAssemblyData *data) {
   CeedCall(CeedCalloc(1, data));
   (*data)->ref_count = 1;
-  (*data)->ceed      = ceed;
-  CeedCall(CeedReference(ceed));
+  CeedCall(CeedReferenceCopy(ceed, &(*data)->ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1304,7 +1589,7 @@ int CeedQFunctionAssemblyDataDestroy(CeedQFunctionAssemblyData *data) {
   @brief Get `CeedOperatorAssemblyData`
 
   @param[in]  op   `CeedOperator` to assemble
-  @param[out] data `CeedQFunctionAssemblyData`
+  @param[out] data `CeedOperatorAssemblyData`
 
   @return An error code: 0 - success, otherwise - failure
 
@@ -1314,7 +1599,7 @@ int CeedOperatorGetOperatorAssemblyData(CeedOperator op, CeedOperatorAssemblyDat
   if (!op->op_assembled) {
     CeedOperatorAssemblyData data;
 
-    CeedCall(CeedOperatorAssemblyDataCreate(op->ceed, op, &data));
+    CeedCall(CeedOperatorAssemblyDataCreate(CeedOperatorReturnCeed(op), op, &data));
     op->op_assembled = data;
   }
   *data = op->op_assembled;
@@ -1354,8 +1639,7 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem
 
   // Allocate
   CeedCall(CeedCalloc(1, data));
-  (*data)->ceed = ceed;
-  CeedCall(CeedReference(ceed));
+  CeedCall(CeedReferenceCopy(ceed, &(*data)->ceed));
 
   // Build OperatorAssembly data
   CeedCall(CeedOperatorGetQFunction(op, &qf));
@@ -1390,6 +1674,7 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem
         (*data)->active_elem_rstrs_in[num_active_bases_in] = NULL;
         CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr_in));
         CeedCall(CeedElemRestrictionReferenceCopy(elem_rstr_in, &(*data)->active_elem_rstrs_in[num_active_bases_in]));
+        CeedCall(CeedElemRestrictionDestroy(&elem_rstr_in));
         CeedCall(CeedRealloc(num_active_bases_in + 1, &num_eval_modes_in));
         num_eval_modes_in[index] = 0;
         CeedCall(CeedRealloc(num_active_bases_in + 1, &eval_modes_in));
@@ -1411,7 +1696,9 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem
         }
         num_eval_modes_in[index] += q_comp;
       }
+      CeedCall(CeedBasisDestroy(&basis_in));
     }
+    CeedCall(CeedVectorDestroy(&vec));
   }
 
   // Determine active output basis
@@ -1445,6 +1732,7 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem
         (*data)->active_elem_rstrs_out[num_active_bases_out] = NULL;
         CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr_out));
         CeedCall(CeedElemRestrictionReferenceCopy(elem_rstr_out, &(*data)->active_elem_rstrs_out[num_active_bases_out]));
+        CeedCall(CeedElemRestrictionDestroy(&elem_rstr_out));
         CeedCall(CeedRealloc(num_active_bases_out + 1, &num_eval_modes_out));
         num_eval_modes_out[index] = 0;
         CeedCall(CeedRealloc(num_active_bases_out + 1, &eval_modes_out));
@@ -1466,8 +1754,11 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem
         }
         num_eval_modes_out[index] += q_comp;
       }
+      CeedCall(CeedBasisDestroy(&basis_out));
     }
+    CeedCall(CeedVectorDestroy(&vec));
   }
+  CeedCall(CeedQFunctionDestroy(&qf));
   (*data)->num_active_bases_in   = num_active_bases_in;
   (*data)->num_eval_modes_in     = num_eval_modes_in;
   (*data)->eval_modes_in         = eval_modes_in;
@@ -1736,15 +2027,19 @@ int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback) {
     CeedCall(CeedIsDebug(ceed, &is_debug));
     if (is_debug) {
       Ceed        ceed_fallback;
-      const char *resource, *resource_fallback;
+      const char *resource, *resource_fallback, *op_name;
 
       CeedCall(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
       CeedCall(CeedGetResource(ceed, &resource));
       CeedCall(CeedGetResource(ceed_fallback, &resource_fallback));
+      CeedCall(CeedOperatorGetName(op, &op_name));
 
       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- CeedOperator Fallback ----------\n");
-      CeedDebug(ceed, "Falling back from %s operator at address %p to %s operator at address %p\n", resource, op, resource_fallback, op->op_fallback);
+      CeedDebug(ceed, "Falling back from Operator with backend %s at address %p to Operator with backend %s at address %p for CeedOperator \"%s\"\n",
+                resource, op, resource_fallback, op->op_fallback, op_name);
+      CeedCall(CeedDestroy(&ceed_fallback));
     }
+    CeedCall(CeedDestroy(&ceed));
   }
   *op_fallback = op->op_fallback;
   return CEED_ERROR_SUCCESS;
@@ -1776,7 +2071,9 @@ int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent) {
   @ref Backend
 **/
 int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent) {
-  *parent = op->op_fallback_parent ? op->op_fallback_parent->ceed : op->ceed;
+  *parent = NULL;
+  if (op->op_fallback_parent) CeedCall(CeedReferenceCopy(CeedOperatorReturnCeed(op->op_fallback_parent), parent));
+  else CeedCall(CeedReferenceCopy(CeedOperatorReturnCeed(op), parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1816,13 +2113,12 @@ int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled,
     CeedCall(op->LinearAssembleQFunction(op, assembled, rstr, request));
   } else {
     // Operator fallback
-    Ceed         ceed;
     CeedOperator op_fallback;
 
-    CeedCall(CeedOperatorGetCeed(op, &ceed));
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleQFunction\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunction(op_fallback, assembled, rstr, request));
-    else return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunction");
+    else return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunction");
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1847,60 +2143,7 @@ int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled,
   @ref User
 **/
 int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) {
-  int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, CeedElemRestriction, CeedRequest *) = NULL;
-  CeedOperator op_assemble                                                                           = NULL;
-  CeedOperator op_fallback_parent                                                                    = NULL;
-
-  CeedCall(CeedOperatorCheckReady(op));
-
-  // Determine if fallback parent or operator has implementation
-  CeedCall(CeedOperatorGetFallbackParent(op, &op_fallback_parent));
-  if (op_fallback_parent && op_fallback_parent->LinearAssembleQFunctionUpdate) {
-    // -- Backend version for op fallback parent is faster, if it exists
-    LinearAssembleQFunctionUpdate = op_fallback_parent->LinearAssembleQFunctionUpdate;
-    op_assemble                   = op_fallback_parent;
-  } else if (op->LinearAssembleQFunctionUpdate) {
-    // -- Backend version for op
-    LinearAssembleQFunctionUpdate = op->LinearAssembleQFunctionUpdate;
-    op_assemble                   = op;
-  }
-
-  // Assemble QFunction
-  if (LinearAssembleQFunctionUpdate) {
-    // Backend or fallback parent version
-    bool                qf_assembled_is_setup;
-    CeedVector          assembled_vec  = NULL;
-    CeedElemRestriction assembled_rstr = NULL;
-
-    CeedCall(CeedQFunctionAssemblyDataIsSetup(op->qf_assembled, &qf_assembled_is_setup));
-    if (qf_assembled_is_setup) {
-      bool update_needed;
-
-      CeedCall(CeedQFunctionAssemblyDataGetObjects(op->qf_assembled, &assembled_vec, &assembled_rstr));
-      CeedCall(CeedQFunctionAssemblyDataIsUpdateNeeded(op->qf_assembled, &update_needed));
-      if (update_needed) CeedCall(LinearAssembleQFunctionUpdate(op_assemble, assembled_vec, assembled_rstr, request));
-    } else {
-      CeedCall(CeedOperatorLinearAssembleQFunction(op_assemble, &assembled_vec, &assembled_rstr, request));
-      CeedCall(CeedQFunctionAssemblyDataSetObjects(op->qf_assembled, assembled_vec, assembled_rstr));
-    }
-    CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(op->qf_assembled, false));
-
-    // Copy reference from internally held copy
-    CeedCall(CeedVectorReferenceCopy(assembled_vec, assembled));
-    CeedCall(CeedElemRestrictionReferenceCopy(assembled_rstr, rstr));
-    CeedCall(CeedVectorDestroy(&assembled_vec));
-    CeedCall(CeedElemRestrictionDestroy(&assembled_rstr));
-  } else {
-    // Operator fallback
-    Ceed         ceed;
-    CeedOperator op_fallback;
-
-    CeedCall(CeedOperatorGetCeed(op, &ceed));
-    CeedCall(CeedOperatorGetFallback(op, &op_fallback));
-    if (op_fallback) CeedCall(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request));
-    else return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunctionUpdate");
-  }
-  return CEED_ERROR_SUCCESS;
+  return CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(op, true, assembled, rstr, request);
 }
 
 /**
@@ -1923,14 +2166,12 @@ int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector
 int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   bool     is_composite;
   CeedSize input_size = 0, output_size = 0;
-  Ceed     ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorCheckReady(op));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
-  CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square");
+  CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square");
 
   // Early exit for empty operator
   if (!is_composite) {
@@ -1949,10 +2190,16 @@ int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, Ce
     CeedCall(CeedVectorSetValue(assembled, 0.0));
     CeedCall(op->LinearAssembleAddDiagonal(op, assembled, request));
     return CEED_ERROR_SUCCESS;
+  } else if (is_composite) {
+    // Default to summing contributions of suboperators
+    CeedCall(CeedVectorSetValue(assembled, 0.0));
+    CeedCall(CeedOperatorLinearAssembleAddDiagonalComposite(op, request, false, assembled));
+    return CEED_ERROR_SUCCESS;
   } else {
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleDiagonal\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssembleDiagonal(op_fallback, assembled, request));
@@ -1985,14 +2232,12 @@ int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, Ce
 int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   bool     is_composite;
   CeedSize input_size = 0, output_size = 0;
-  Ceed     ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorCheckReady(op));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
-  CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square");
+  CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square");
 
   // Early exit for empty operator
   if (!is_composite) {
@@ -2006,10 +2251,15 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled,
     // Backend version
     CeedCall(op->LinearAssembleAddDiagonal(op, assembled, request));
     return CEED_ERROR_SUCCESS;
+  } else if (is_composite) {
+    // Default to summing contributions of suboperators
+    CeedCall(CeedOperatorLinearAssembleAddDiagonalComposite(op, request, false, assembled));
+    return CEED_ERROR_SUCCESS;
   } else {
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleAddDiagonal\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request));
@@ -2017,11 +2267,7 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled,
     }
   }
   // Default interface implementation
-  if (is_composite) {
-    CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, false, assembled));
-  } else {
-    CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal(op, request, false, assembled));
-  }
+  CeedCall(CeedOperatorLinearAssembleAddDiagonalSingle(op, request, false, assembled));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2046,21 +2292,19 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled,
    @ref User
 **/
 int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols) {
-  Ceed          ceed;
   bool          is_composite;
   CeedInt       num_active_components, num_sub_operators;
   CeedOperator *sub_operators;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
 
   CeedSize input_size = 0, output_size = 0;
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
-  CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square");
+  CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square");
 
   if (is_composite) {
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_sub_operators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_sub_operators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
   } else {
     sub_operators     = &op;
     num_sub_operators = 1;
@@ -2086,11 +2330,13 @@ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSi
         CeedInt comp_stride_sub, num_active_components_sub;
 
         CeedCall(CeedElemRestrictionGetCompStride(active_elem_rstrs[i], &comp_stride_sub));
-        CeedCheck(comp_stride == comp_stride_sub, ceed, CEED_ERROR_DIMENSION,
+        CeedCheck(comp_stride == comp_stride_sub, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION,
                   "Active element restrictions must have the same component stride: %d vs %d", comp_stride, comp_stride_sub);
         CeedCall(CeedElemRestrictionGetNumComponents(active_elem_rstrs[i], &num_active_components_sub));
-        CeedCheck(num_active_components == num_active_components_sub, ceed, CEED_ERROR_INCOMPATIBLE,
-                  "All suboperators must have the same number of output components");
+        CeedCheck(num_active_components == num_active_components_sub, CeedOperatorReturnCeed(op), CEED_ERROR_INCOMPATIBLE,
+                  "All suboperators must have the same number of output components."
+                  " Previous: %" CeedInt_FMT " Current: %" CeedInt_FMT,
+                  num_active_components, num_active_components_sub);
       }
     }
   }
@@ -2123,6 +2369,7 @@ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSi
 
     CeedCall(CeedElemRestrictionRestoreOffsets(active_elem_rstr, &offsets));
     CeedCall(CeedElemRestrictionRestoreOffsets(point_block_active_elem_rstr, &point_block_offsets));
+    CeedCall(CeedElemRestrictionDestroy(&active_elem_rstr));
     CeedCall(CeedElemRestrictionDestroy(&point_block_active_elem_rstr));
   }
   return CEED_ERROR_SUCCESS;
@@ -2150,14 +2397,12 @@ int CeedOperatorLinearAssemblePointBlockDiagonalSymbolic(CeedOperator op, CeedSi
 int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   bool     is_composite;
   CeedSize input_size = 0, output_size = 0;
-  Ceed     ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorCheckReady(op));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
-  CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square");
+  CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square");
 
   // Early exit for empty operator
   if (!is_composite) {
@@ -2180,6 +2425,7 @@ int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, CeedVector ass
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssemblePointBlockDiagonal\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssemblePointBlockDiagonal(op_fallback, assembled, request));
@@ -2214,14 +2460,12 @@ int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, CeedVector ass
 int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) {
   bool     is_composite;
   CeedSize input_size = 0, output_size = 0;
-  Ceed     ceed;
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedOperatorCheckReady(op));
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size));
-  CeedCheck(input_size == output_size, ceed, CEED_ERROR_DIMENSION, "Operator must be square");
+  CeedCheck(input_size == output_size, CeedOperatorReturnCeed(op), CEED_ERROR_DIMENSION, "Operator must be square");
 
   // Early exit for empty operator
   if (!is_composite) {
@@ -2239,6 +2483,7 @@ int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleAddPointBlockDiagonal\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssembleAddPointBlockDiagonal(op_fallback, assembled, request));
@@ -2247,9 +2492,9 @@ int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector
   }
   // Default interface implementation
   if (is_composite) {
-    CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, true, assembled));
+    CeedCall(CeedOperatorLinearAssembleAddDiagonalComposite(op, request, true, assembled));
   } else {
-    CeedCall(CeedSingleOperatorLinearAssembleAddDiagonal(op, request, true, assembled));
+    CeedCall(CeedOperatorLinearAssembleAddDiagonalSingle(op, request, true, assembled));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -2291,6 +2536,7 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, C
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssembleSymbolic\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssembleSymbolic(op_fallback, num_entries, rows, cols));
@@ -2301,32 +2547,21 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, C
   // Default interface implementation
 
   // Count entries and allocate rows, cols arrays
-  *num_entries = 0;
-  if (is_composite) {
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    for (CeedInt k = 0; k < num_suboperators; ++k) {
-      CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries));
-      *num_entries += single_entries;
-    }
-  } else {
-    CeedCall(CeedSingleOperatorAssemblyCountEntries(op, &single_entries));
-    *num_entries += single_entries;
-  }
+  CeedCall(CeedOperatorLinearAssembleGetNumEntries(op, num_entries));
   CeedCall(CeedCalloc(*num_entries, rows));
   CeedCall(CeedCalloc(*num_entries, cols));
 
   // Assemble nonzero locations
   if (is_composite) {
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
     for (CeedInt k = 0; k < num_suboperators; ++k) {
-      CeedCall(CeedSingleOperatorAssembleSymbolic(sub_operators[k], offset, *rows, *cols));
-      CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries));
+      CeedCall(CeedOperatorAssembleSymbolicSingle(sub_operators[k], offset, *rows, *cols));
+      CeedCall(CeedOperatorAssemblyCountEntriesSingle(sub_operators[k], &single_entries));
       offset += single_entries;
     }
   } else {
-    CeedCall(CeedSingleOperatorAssembleSymbolic(op, offset, *rows, *cols));
+    CeedCall(CeedOperatorAssembleSymbolicSingle(op, offset, *rows, *cols));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -2370,10 +2605,26 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
     // Backend version
     CeedCall(op->LinearAssemble(op, values));
     return CEED_ERROR_SUCCESS;
+  } else if (is_composite) {
+    // Default to summing contributions of suboperators
+    CeedCall(CeedVectorSetValue(values, 0.0));
+    CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
+    CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
+    for (CeedInt k = 0; k < num_suboperators; k++) {
+      CeedCall(CeedOperatorAssembleSingle(sub_operators[k], offset, values));
+      CeedCall(CeedOperatorAssemblyCountEntriesSingle(sub_operators[k], &single_entries));
+      offset += single_entries;
+    }
+    return CEED_ERROR_SUCCESS;
+  } else if (op->LinearAssembleSingle) {
+    CeedCall(CeedVectorSetValue(values, 0.0));
+    CeedCall(CeedOperatorAssembleSingle(op, offset, values));
+    return CEED_ERROR_SUCCESS;
   } else {
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorLinearAssemble\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorLinearAssemble(op_fallback, values));
@@ -2381,19 +2632,9 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
     }
   }
 
-  // Default interface implementation
+  // Default to interface version if non-composite and no fallback
   CeedCall(CeedVectorSetValue(values, 0.0));
-  if (is_composite) {
-    CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-    CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
-    for (CeedInt k = 0; k < num_suboperators; k++) {
-      CeedCall(CeedSingleOperatorAssemble(sub_operators[k], offset, values));
-      CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries));
-      offset += single_entries;
-    }
-  } else {
-    CeedCall(CeedSingleOperatorAssemble(op, offset, values));
-  }
+  CeedCall(CeedOperatorAssembleSingle(op, offset, values));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2411,7 +2652,7 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) {
 
   @ref User
 **/
-int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult) {
+int CeedOperatorCompositeGetMultiplicity(CeedOperator op, CeedInt num_skip_indices, CeedInt *skip_indices, CeedVector mult) {
   Ceed                ceed;
   CeedInt             num_suboperators;
   CeedSize            l_vec_len;
@@ -2422,19 +2663,19 @@ int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indic
 
   CeedCall(CeedOperatorCheckReady(op));
 
-  CeedCall(CeedOperatorGetCeed(op, &ceed));
-
   // Zero mult vector
   CeedCall(CeedVectorSetValue(mult, 0.0));
 
   // Get suboperators
-  CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators));
-  CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators));
+  CeedCall(CeedOperatorCompositeGetNumSub(op, &num_suboperators));
   if (num_suboperators == 0) return CEED_ERROR_SUCCESS;
+  CeedCall(CeedOperatorCompositeGetSubList(op, &sub_operators));
 
   // Work vector
   CeedCall(CeedVectorGetLength(mult, &l_vec_len));
+  CeedCall(CeedOperatorGetCeed(op, &ceed));
   CeedCall(CeedVectorCreate(ceed, l_vec_len, &ones_l_vec));
+  CeedCall(CeedDestroy(&ceed));
   CeedCall(CeedVectorSetValue(ones_l_vec, 1.0));
   CeedCall(CeedVectorGetArray(mult, CEED_MEM_HOST, &mult_array));
 
@@ -2451,13 +2692,14 @@ int CeedCompositeOperatorGetMultiplicity(CeedOperator op, CeedInt num_skip_indic
     // -- Sub operator multiplicity
     CeedCall(CeedOperatorGetActiveElemRestriction(sub_operators[i], &elem_rstr));
     CeedCall(CeedElemRestrictionCreateUnorientedCopy(elem_rstr, &mult_elem_rstr));
+    CeedCall(CeedElemRestrictionDestroy(&elem_rstr));
     CeedCall(CeedElemRestrictionCreateVector(mult_elem_rstr, &sub_mult_l_vec, &ones_e_vec));
     CeedCall(CeedVectorSetValue(sub_mult_l_vec, 0.0));
     CeedCall(CeedElemRestrictionApply(mult_elem_rstr, CEED_NOTRANSPOSE, ones_l_vec, ones_e_vec, CEED_REQUEST_IMMEDIATE));
     CeedCall(CeedElemRestrictionApply(mult_elem_rstr, CEED_TRANSPOSE, ones_e_vec, sub_mult_l_vec, CEED_REQUEST_IMMEDIATE));
     CeedCall(CeedVectorGetArrayRead(sub_mult_l_vec, CEED_MEM_HOST, &sub_mult_array));
     // ---- Flag every node present in the current suboperator
-    for (CeedInt j = 0; j < l_vec_len; j++) {
+    for (CeedSize j = 0; j < l_vec_len; j++) {
       if (sub_mult_array[j] > 0.0) mult_array[j] += 1.0;
     }
     CeedCall(CeedVectorRestoreArrayRead(sub_mult_l_vec, &sub_mult_array));
@@ -2499,10 +2741,12 @@ int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fin
 
     CeedCall(CeedOperatorGetActiveBasis(op_fine, &basis_fine));
     CeedCall(CeedBasisCreateProjection(basis_coarse, basis_fine, &basis_c_to_f));
+    CeedCall(CeedBasisDestroy(&basis_fine));
   }
 
   // Core code
-  CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
+  CeedCall(CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong,
+                                                       op_restrict));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2538,7 +2782,10 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_
   CeedCall(CeedOperatorGetActiveBasis(op_fine, &basis_fine));
   CeedCall(CeedBasisGetNumQuadraturePoints(basis_fine, &Q_f));
   CeedCall(CeedBasisGetNumQuadraturePoints(basis_coarse, &Q_c));
-  CeedCheck(Q_f == Q_c, ceed, CEED_ERROR_DIMENSION, "Bases must have compatible quadrature spaces");
+  CeedCheck(Q_f == Q_c, ceed, CEED_ERROR_DIMENSION,
+            "Bases must have compatible quadrature spaces."
+            " Fine grid: %" CeedInt_FMT " points, Coarse grid: %" CeedInt_FMT " points",
+            Q_f, Q_c);
 
   // Create coarse to fine basis, if required
   if (op_prolong || op_restrict) {
@@ -2551,6 +2798,7 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_
     CeedCall(CeedBasisGetDimension(basis_fine, &dim));
     CeedCall(CeedBasisGetNumComponents(basis_fine, &num_comp));
     CeedCall(CeedBasisGetNumNodes1D(basis_fine, &P_1d_f));
+    CeedCall(CeedBasisDestroy(&basis_fine));
     CeedCall(CeedElemRestrictionGetElementSize(rstr_coarse, &num_nodes_c));
     P_1d_c = dim == 1 ? num_nodes_c : dim == 2 ? sqrt(num_nodes_c) : cbrt(num_nodes_c);
     CeedCall(CeedCalloc(P_1d_f, &q_ref));
@@ -2563,7 +2811,9 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_
   }
 
   // Core code
-  CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
+  CeedCall(CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong,
+                                                       op_restrict));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2614,6 +2864,7 @@ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_f
     CeedCall(CeedBasisGetDimension(basis_fine, &dim));
     CeedCall(CeedBasisGetNumComponents(basis_fine, &num_comp));
     CeedCall(CeedBasisGetNumNodes(basis_fine, &num_nodes_f));
+    CeedCall(CeedBasisDestroy(&basis_fine));
     CeedCall(CeedElemRestrictionGetElementSize(rstr_coarse, &num_nodes_c));
     CeedCall(CeedCalloc(num_nodes_f * dim, &q_ref));
     CeedCall(CeedCalloc(num_nodes_f, &q_weight));
@@ -2625,7 +2876,9 @@ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_f
   }
 
   // Core code
-  CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict));
+  CeedCall(CeedOperatorMultigridLevelCreateSingle_Core(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong,
+                                                       op_restrict));
+  CeedCall(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -2652,7 +2905,6 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
   Ceed                 ceed, ceed_parent;
   bool                 interp = false, grad = false, is_tensor_basis = true;
   CeedInt              num_input_fields, P_1d, Q_1d, num_nodes, num_qpts, dim, num_comp = 1, num_elem = 1;
-  CeedSize             l_size = 1;
   CeedScalar          *mass, *laplace, *x, *fdm_interp, *lambda, *elem_avg;
   const CeedScalar    *interp_1d, *grad_1d, *q_weight_1d;
   CeedVector           q_data;
@@ -2673,6 +2925,7 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
     // Operator fallback
     CeedOperator op_fallback;
 
+    CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back for CeedOperatorCreateFDMElementInverse\n");
     CeedCall(CeedOperatorGetFallback(op, &op_fallback));
     if (op_fallback) {
       CeedCall(CeedOperatorCreateFDMElementInverse(op_fallback, fdm_inv, request));
@@ -2698,9 +2951,10 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
       CeedCall(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode));
       interp = interp || eval_mode == CEED_EVAL_INTERP;
       grad   = grad || eval_mode == CEED_EVAL_GRAD;
-      CeedCall(CeedOperatorFieldGetBasis(op_fields[i], &basis));
-      CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
+      if (!basis) CeedCall(CeedOperatorFieldGetBasis(op_fields[i], &basis));
+      if (!rstr) CeedCall(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr));
     }
+    CeedCall(CeedVectorDestroy(&vec));
   }
   CeedCheck(basis, ceed, CEED_ERROR_BACKEND, "No active field set");
   CeedCall(CeedBasisGetNumNodes1D(basis, &P_1d));
@@ -2710,7 +2964,6 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
   CeedCall(CeedBasisGetDimension(basis, &dim));
   CeedCall(CeedBasisGetNumComponents(basis, &num_comp));
   CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem));
-  CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
 
   // Build and diagonalize 1D Mass and Laplacian
   CeedCall(CeedBasisIsTensor(basis, &is_tensor_basis));
@@ -2802,8 +3055,9 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
     CeedCall(CeedVectorGetArrayWrite(q_data, CEED_MEM_HOST, &q_data_array));
     for (CeedInt e = 0; e < num_elem; e++) {
       for (CeedInt c = 0; c < num_comp; c++) {
-        for (CeedInt n = 0; n < num_nodes; n++)
+        for (CeedInt n = 0; n < num_nodes; n++) {
           q_data_array[(e * num_comp + c) * num_nodes + n] = 1. / (elem_avg[e] * fdm_diagonal[c * num_nodes + n]);
+        }
       }
     }
     CeedCall(CeedFree(&elem_avg));
@@ -2830,7 +3084,8 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
   // -- Restriction
   {
     CeedInt strides[3] = {1, num_nodes, num_nodes * num_comp};
-    CeedCall(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, num_nodes, num_comp, num_elem * num_comp * num_nodes, strides, &rstr_qd_i));
+    CeedCall(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, num_nodes, num_comp,
+                                              (CeedSize)num_elem * (CeedSize)num_comp * (CeedSize)num_nodes, strides, &rstr_qd_i));
   }
 
   // -- QFunction
@@ -2859,9 +3114,14 @@ int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv,
   CeedCall(CeedOperatorSetField(*fdm_inv, "output", rstr, fdm_basis, CEED_VECTOR_ACTIVE));
 
   // Cleanup
+  CeedCall(CeedDestroy(&ceed));
+  CeedCall(CeedDestroy(&ceed_parent));
   CeedCall(CeedVectorDestroy(&q_data));
-  CeedCall(CeedBasisDestroy(&fdm_basis));
+  CeedCall(CeedElemRestrictionDestroy(&rstr));
   CeedCall(CeedElemRestrictionDestroy(&rstr_qd_i));
+  CeedCall(CeedBasisDestroy(&basis));
+  CeedCall(CeedBasisDestroy(&fdm_basis));
+  CeedCall(CeedQFunctionDestroy(&qf));
   CeedCall(CeedQFunctionDestroy(&qf_fdm));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-qfunction-register.c b/interface/ceed-qfunction-register.c
index 3558d0a225..eb3832c4f5 100644
--- a/interface/ceed-qfunction-register.c
+++ b/interface/ceed-qfunction-register.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -32,7 +32,7 @@ int CeedQFunctionRegisterAll(void) {
 
   CeedPragmaCritical(CeedQFunctionRegisterAll) {
     if (!register_all_called) {
-      CeedDebugEnv256(1, "\n---------- Registering Gallery QFunctions ----------\n");
+      CeedDebugEnv256(CEED_DEBUG_COLOR_SUCCESS, "\n---------- Registering Gallery QFunctions ----------\n");
 #define CEED_GALLERY_QFUNCTION(name) \
   if (!ierr) ierr = name();
 #include "../gallery/ceed-gallery-list.h"
diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c
index 226b33c19e..8f2ffbbd70 100644
--- a/interface/ceed-qfunction.c
+++ b/interface/ceed-qfunction.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -117,13 +117,14 @@ static int CeedQFunctionFieldSet(CeedQFunctionField *f, const char *field_name,
   @param[in] field        `CeedQFunction` field to view
   @param[in] field_number Number of field being viewed
   @param[in] in           true for input field, false for output
+  @param[in] tabs         Tabs to append before each new line
   @param[in] stream       Stream to view to, e.g., `stdout`
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref Utility
 **/
-static int CeedQFunctionFieldView(CeedQFunctionField field, CeedInt field_number, bool in, FILE *stream) {
+static int CeedQFunctionFieldView(CeedQFunctionField field, CeedInt field_number, bool in, const char *tabs, FILE *stream) {
   const char  *inout = in ? "Input" : "Output";
   const char  *field_name;
   CeedInt      size;
@@ -131,13 +132,42 @@ static int CeedQFunctionFieldView(CeedQFunctionField field, CeedInt field_number
 
   CeedCall(CeedQFunctionFieldGetData(field, &field_name, &size, &eval_mode));
   fprintf(stream,
-          "    %s field %" CeedInt_FMT
-          ":\n"
-          "      Name: \"%s\"\n"
+          "%s    %s field %" CeedInt_FMT
+          ":\n%s"
+          "      Name: \"%s\"\n%s"
           "      Size: %" CeedInt_FMT
-          "\n"
+          "\n%s"
           "      EvalMode: \"%s\"\n",
-          inout, field_number, field_name, size, CeedEvalModes[eval_mode]);
+          tabs, inout, field_number, tabs, field_name, tabs, size, tabs, CeedEvalModes[eval_mode]);
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief View a `CeedQFunction` passed as a `CeedObject`
+
+  @param[in] qf     `CeedQFunction` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedQFunctionView_Object(CeedObject qf, FILE *stream) {
+  CeedCall(CeedQFunctionView((CeedQFunction)qf, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Destroy a `CeedQFunction` passed as a `CeedObject`
+
+  @param[in,out] qf Address of `CeedQFunction` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedQFunctionDestroy_Object(CeedObject *qf) {
+  CeedCall(CeedQFunctionDestroy((CeedQFunction *)qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -196,11 +226,31 @@ int CeedQFunctionGetNumArgs(CeedQFunction qf, CeedInt *num_input, CeedInt *num_o
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Get the name of the `CeedQFunction`.
+    Use the `name` if created via @ref CeedQFunctionCreateInteriorByName(), otherwise return the kernel name via @ref CeedQFunctionGetKernelName().
+
+  @param[in]  qf   `CeedQFunction`
+  @param[out] name Variable to store `CeedQFunction` name
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedQFunctionGetName(CeedQFunction qf, const char **name) {
+  if (qf->is_gallery) {
+    *name = qf->gallery_name;
+  } else {
+    CeedCall(CeedQFunctionGetKernelName(qf, name));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Get the name of the user function for a `CeedQFunction`
 
   @param[in]  qf          `CeedQFunction`
-  @param[out] kernel_name Variable to store source path string
+  @param[out] kernel_name Variable to store string holding kernel name
 
   @return An error code: 0 - success, otherwise - failure
 
@@ -252,6 +302,7 @@ int CeedQFunctionGetSourcePath(CeedQFunction qf, const char **source_path) {
     } else {
       CeedCall(CeedGetJitAbsolutePath(ceed, qf->user_source, &absolute_path));
     }
+    CeedCall(CeedDestroy(&ceed));
 
     size_t source_len = strlen(absolute_path) - kernel_name_len - 1;
 
@@ -295,6 +346,7 @@ int CeedQFunctionLoadSourceToBuffer(CeedQFunction qf, const char **source_buffer
 
     CeedCall(CeedQFunctionGetCeed(qf, &ceed));
     CeedCall(CeedLoadSourceToBuffer(ceed, source_path, &buffer));
+    CeedCall(CeedDestroy(&ceed));
     *source_buffer = buffer;
   }
   return CEED_ERROR_SUCCESS;
@@ -328,7 +380,8 @@ int CeedQFunctionGetUserFunction(CeedQFunction qf, CeedQFunctionUser *f) {
   @ref Backend
 **/
 int CeedQFunctionGetContext(CeedQFunction qf, CeedQFunctionContext *ctx) {
-  *ctx = qf->ctx;
+  *ctx = NULL;
+  if (qf->ctx) CeedCall(CeedQFunctionContextReferenceCopy(qf->ctx, ctx));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -359,6 +412,7 @@ int CeedQFunctionGetContextData(CeedQFunction qf, CeedMemType mem_type, void *da
   } else {
     *(void **)data = NULL;
   }
+  CeedCall(CeedQFunctionContextDestroy(&ctx));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -385,7 +439,7 @@ int CeedQFunctionRestoreContextData(CeedQFunction qf, void *data) {
       CeedCall(CeedQFunctionContextRestoreDataRead(ctx, data));
     }
   }
-  *(void **)data = NULL;
+  CeedCall(CeedQFunctionContextDestroy(&ctx));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -413,6 +467,7 @@ int CeedQFunctionGetInnerContext(CeedQFunction qf, CeedQFunctionContext *ctx) {
   } else {
     *ctx = qf_ctx;
   }
+  CeedCall(CeedQFunctionContextDestroy(&qf_ctx));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -572,7 +627,7 @@ int CeedQFunctionSetImmutable(CeedQFunction qf) {
   @ref Backend
 **/
 int CeedQFunctionReference(CeedQFunction qf) {
-  qf->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)qf));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -608,6 +663,8 @@ int CeedQFunctionGetFlopsEstimate(CeedQFunction qf, CeedSize *flops) {
   @param[in]  source     Absolute path to source of `CeedQFunctionUser`, "\abs_path\file.h:function_name".
                            The entire source file must only contain constructs supported by all targeted backends (i.e. CUDA for `/gpu/cuda`, OpenCL/SYCL for `/gpu/sycl`, etc.).
                            The entire contents of this file and all locally included files are used during JiT compilation for GPU backends.
+                           The header `ceed/types.h` is preferred over `ceed.h` or `ceed/ceed.h` for `CeedQFunction` source files.
+                           The macro `CEED_RUNNING_JIT_PASS` is set during JiT and can be used to guard include statements that JiT compilers cannot use, such as `math.h` or `std*.h`.
                            All source files must be at the provided filepath at runtime for JiT to function.
   @param[out] qf         Address of the variable where the newly created `CeedQFunction` will be stored
 
@@ -624,8 +681,9 @@ int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, CeedQFunctionUser
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "QFunction"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionCreateInterior");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedQFunctionCreateInterior");
     CeedCall(CeedQFunctionCreateInterior(delegate, vec_length, f, source, qf));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
@@ -633,8 +691,7 @@ int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, CeedQFunctionUser
             "Provided path to source does not include function name. Provided: \"%s\"\nRequired: \"\\abs_path\\file.h:function_name\"", source);
 
   CeedCall(CeedCalloc(1, qf));
-  CeedCall(CeedReferenceCopy(ceed, &(*qf)->ceed));
-  (*qf)->ref_count           = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedQFunctionView_Object, CeedQFunctionDestroy_Object, &(*qf)->obj));
   (*qf)->vec_length          = vec_length;
   (*qf)->is_identity         = false;
   (*qf)->is_context_writable = true;
@@ -725,6 +782,7 @@ int CeedQFunctionCreateIdentity(Ceed ceed, CeedInt size, CeedEvalMode in_mode, C
   CeedCall(CeedQFunctionGetContext(*qf, &ctx));
   CeedCall(CeedQFunctionContextGetFieldLabel(ctx, "size", &size_label));
   CeedCall(CeedQFunctionContextSetInt32(ctx, size_label, &size));
+  CeedCall(CeedQFunctionContextDestroy(&ctx));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -755,30 +813,37 @@ int CeedQFunctionReferenceCopy(CeedQFunction qf, CeedQFunction *qf_copy) {
 
   @param[in,out] qf         `CeedQFunction`
   @param[in]     field_name Name of `CeedQFunction` field
-  @param[in]     size       Size of `CeedQFunction` field, (`num_comp * 1`) for @ref CEED_EVAL_NONE, (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space, (`num_comp * dim`) for @ref CEED_EVAL_GRAD, or (`num_comp * 1`) for @ref CEED_EVAL_DIV, and (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` otherwise `curl_dim = dim` for @ref CEED_EVAL_CURL.
+  @param[in]     size       Size of `CeedQFunction` field,
+                              (`num_comp * 1`) for @ref CEED_EVAL_NONE,
+                              (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space,
+                              (`num_comp * dim`) for @ref CEED_EVAL_GRAD,
+                              (`num_comp * 1`) for @ref CEED_EVAL_DIV, and
+                              (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` otherwise `curl_dim = dim` for @ref CEED_EVAL_CURL.
   @param[in]     eval_mode  @ref CEED_EVAL_NONE to use values directly,
                               @ref CEED_EVAL_INTERP to use interpolated values,
                               @ref CEED_EVAL_GRAD to use gradients,
                               @ref CEED_EVAL_DIV to use divergence,
                               @ref CEED_EVAL_CURL to use curl
 
+  Note: In the user `CeedQFunctionUser`, the `in` argument list the fields in the order given by the calls to `CeedQFunctionAddInput`.
+
   @return An error code: 0 - success, otherwise - failure
 
   @ref User
 **/
 int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, CeedInt size, CeedEvalMode eval_mode) {
   bool is_immutable;
-  Ceed ceed;
 
-  CeedCall(CeedQFunctionGetCeed(qf, &ceed));
   CeedCall(CeedQFunctionIsImmutable(qf, &is_immutable));
-  CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "QFunction cannot be changed after set as immutable");
-  CeedCheck(eval_mode != CEED_EVAL_WEIGHT || size == 1, ceed, CEED_ERROR_DIMENSION, "CEED_EVAL_WEIGHT should have size 1");
+  CeedCheck(!is_immutable, CeedQFunctionReturnCeed(qf), CEED_ERROR_MAJOR, "QFunction cannot be changed after set as immutable");
+  CeedCheck(eval_mode != CEED_EVAL_WEIGHT || size == 1, CeedQFunctionReturnCeed(qf), CEED_ERROR_DIMENSION, "CEED_EVAL_WEIGHT should have size 1");
   for (CeedInt i = 0; i < qf->num_input_fields; i++) {
-    CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique");
+    CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR,
+              "CeedQFunction field names must be unique. Duplicate name: %s", field_name);
   }
   for (CeedInt i = 0; i < qf->num_output_fields; i++) {
-    CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique");
+    CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR,
+              "CeedQFunction field names must be unique. Duplicate name: %s", field_name);
   }
   CeedCall(CeedQFunctionFieldSet(&qf->input_fields[qf->num_input_fields], field_name, size, eval_mode));
   qf->num_input_fields++;
@@ -790,30 +855,38 @@ int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, CeedInt size
 
   @param[in,out] qf         `CeedQFunction`
   @param[in]     field_name Name of `CeedQFunction` field
-  @param[in]     size       Size of `CeedQFunction` field, (`num_comp * 1`) for @ref CEED_EVAL_NONE, (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space, (`num_comp * dim`) for @ref CEED_EVAL_GRAD, or (`num_comp * 1`) for @ref CEED_EVAL_DIV, and (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` else dim for @ref CEED_EVAL_CURL.
+  @param[in]     size       Size of `CeedQFunction` field,
+                              (`num_comp * 1`) for @ref CEED_EVAL_NONE,
+                              (`num_comp * 1`) for @ref CEED_EVAL_INTERP for an \f$H^1\f$ space or (`num_comp * dim`) for an \f$H(\mathrm{div})\f$ or \f$H(\mathrm{curl})\f$ space,
+                              (`num_comp * dim`) for @ref CEED_EVAL_GRAD,
+                              (`num_comp * 1`) for @ref CEED_EVAL_DIV, and
+                              (`num_comp * curl_dim`) with `curl_dim = 1` if `dim < 3` otherwise `curl_dim = dim` for @ref CEED_EVAL_CURL.
   @param[in]     eval_mode  @ref CEED_EVAL_NONE to use values directly,
                               @ref CEED_EVAL_INTERP to use interpolated values,
                               @ref CEED_EVAL_GRAD to use gradients,
                               @ref CEED_EVAL_DIV to use divergence,
                               @ref CEED_EVAL_CURL to use curl.
 
+  Note: In the user `CeedQFunctionUser`, the `out` argument list the fields in the order given by the calls to `CeedQFunctionAddOutput`.
+
   @return An error code: 0 - success, otherwise - failure
 
   @ref User
 **/
 int CeedQFunctionAddOutput(CeedQFunction qf, const char *field_name, CeedInt size, CeedEvalMode eval_mode) {
   bool is_immutable;
-  Ceed ceed;
 
-  CeedCall(CeedQFunctionGetCeed(qf, &ceed));
   CeedCall(CeedQFunctionIsImmutable(qf, &is_immutable));
-  CeedCheck(!is_immutable, ceed, CEED_ERROR_MAJOR, "CeedQFunction cannot be changed after set as immutable");
-  CeedCheck(eval_mode != CEED_EVAL_WEIGHT, ceed, CEED_ERROR_DIMENSION, "Cannot create CeedQFunction output with CEED_EVAL_WEIGHT");
+  CeedCheck(!is_immutable, CeedQFunctionReturnCeed(qf), CEED_ERROR_MAJOR, "CeedQFunction cannot be changed after set as immutable");
+  CeedCheck(eval_mode != CEED_EVAL_WEIGHT, CeedQFunctionReturnCeed(qf), CEED_ERROR_DIMENSION,
+            "Cannot create CeedQFunction output with CEED_EVAL_WEIGHT");
   for (CeedInt i = 0; i < qf->num_input_fields; i++) {
-    CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique");
+    CeedCheck(strcmp(field_name, qf->input_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR,
+              "CeedQFunction field names must be unique");
   }
   for (CeedInt i = 0; i < qf->num_output_fields; i++) {
-    CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), ceed, CEED_ERROR_MINOR, "CeedQFunction field names must be unique");
+    CeedCheck(strcmp(field_name, qf->output_fields[i]->field_name), CeedQFunctionReturnCeed(qf), CEED_ERROR_MINOR,
+              "CeedQFunction field names must be unique");
   }
   CeedCall(CeedQFunctionFieldSet(&qf->output_fields[qf->num_output_fields], field_name, size, eval_mode));
   qf->num_output_fields++;
@@ -966,6 +1039,36 @@ int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedQFunctionView() output
+
+  @param[in] qf       `CeedQFunction` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedQFunctionSetNumViewTabs(CeedQFunction qf, CeedInt num_tabs) {
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)qf, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get the number of tabs to indent for @ref CeedQFunctionView() output
+
+  @param[in]  qf       `CeedQFunction` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedQFunctionGetNumViewTabs(CeedQFunction qf, CeedInt *num_tabs) {
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)qf, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedQFunction`
 
@@ -977,20 +1080,30 @@ int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops) {
   @ref User
 **/
 int CeedQFunctionView(CeedQFunction qf, FILE *stream) {
-  const char *kernel_name;
+  char       *tabs = NULL;
+  const char *name;
+
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedQFunctionGetNumViewTabs(qf, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
 
-  CeedCall(CeedQFunctionGetKernelName(qf, &kernel_name));
-  fprintf(stream, "%sCeedQFunction - %s\n", qf->is_gallery ? "Gallery " : "User ", qf->is_gallery ? qf->gallery_name : kernel_name);
+  CeedCall(CeedQFunctionGetName(qf, &name));
+  fprintf(stream, "%s%sCeedQFunction - %s\n", tabs, qf->is_gallery ? "Gallery " : "User ", name);
 
-  fprintf(stream, "  %" CeedInt_FMT " input field%s:\n", qf->num_input_fields, qf->num_input_fields > 1 ? "s" : "");
+  fprintf(stream, "%s  %" CeedInt_FMT " input field%s:\n", tabs, qf->num_input_fields, qf->num_input_fields > 1 ? "s" : "");
   for (CeedInt i = 0; i < qf->num_input_fields; i++) {
-    CeedCall(CeedQFunctionFieldView(qf->input_fields[i], i, 1, stream));
+    CeedCall(CeedQFunctionFieldView(qf->input_fields[i], i, 1, tabs, stream));
   }
 
-  fprintf(stream, "  %" CeedInt_FMT " output field%s:\n", qf->num_output_fields, qf->num_output_fields > 1 ? "s" : "");
+  fprintf(stream, "%s  %" CeedInt_FMT " output field%s:\n", tabs, qf->num_output_fields, qf->num_output_fields > 1 ? "s" : "");
   for (CeedInt i = 0; i < qf->num_output_fields; i++) {
-    CeedCall(CeedQFunctionFieldView(qf->output_fields[i], i, 0, stream));
+    CeedCall(CeedQFunctionFieldView(qf->output_fields[i], i, 0, tabs, stream));
   }
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1005,7 +1118,7 @@ int CeedQFunctionView(CeedQFunction qf, FILE *stream) {
   @ref Advanced
 **/
 int CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed) {
-  *ceed = CeedQFunctionReturnCeed(qf);
+  CeedCall(CeedObjectGetCeed((CeedObject)qf, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1018,7 +1131,7 @@ int CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed) {
 
   @ref Advanced
 **/
-Ceed CeedQFunctionReturnCeed(CeedQFunction qf) { return qf->ceed; }
+Ceed CeedQFunctionReturnCeed(CeedQFunction qf) { return CeedObjectReturnCeed((CeedObject)qf); }
 
 /**
   @brief Apply the action of a `CeedQFunction`
@@ -1036,13 +1149,11 @@ Ceed CeedQFunctionReturnCeed(CeedQFunction qf) { return qf->ceed; }
 **/
 int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, CeedVector *u, CeedVector *v) {
   CeedInt vec_length;
-  Ceed    ceed;
 
-  CeedCall(CeedQFunctionGetCeed(qf, &ceed));
-  CeedCheck(qf->Apply, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionApply");
+  CeedCheck(qf->Apply, CeedQFunctionReturnCeed(qf), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionApply");
   CeedCall(CeedQFunctionGetVectorLength(qf, &vec_length));
-  CeedCheck(Q % vec_length == 0, ceed, CEED_ERROR_DIMENSION, "Number of quadrature points %" CeedInt_FMT " must be a multiple of %" CeedInt_FMT, Q,
-            qf->vec_length);
+  CeedCheck(Q % vec_length == 0, CeedQFunctionReturnCeed(qf), CEED_ERROR_DIMENSION,
+            "Number of quadrature points %" CeedInt_FMT " must be a multiple of %" CeedInt_FMT, Q, qf->vec_length);
   CeedCall(CeedQFunctionSetImmutable(qf));
   CeedCall(qf->Apply(qf, Q, u, v));
   return CEED_ERROR_SUCCESS;
@@ -1058,7 +1169,7 @@ int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, CeedVector *u, CeedVector *v
   @ref User
 **/
 int CeedQFunctionDestroy(CeedQFunction *qf) {
-  if (!*qf || --(*qf)->ref_count > 0) {
+  if (!*qf || CeedObjectDereference((CeedObject)*qf) > 0) {
     *qf = NULL;
     return CEED_ERROR_SUCCESS;
   }
@@ -1085,7 +1196,7 @@ int CeedQFunctionDestroy(CeedQFunction *qf) {
   CeedCall(CeedFree(&(*qf)->source_path));
   CeedCall(CeedFree(&(*qf)->gallery_name));
   CeedCall(CeedFree(&(*qf)->kernel_name));
-  CeedCall(CeedDestroy(&(*qf)->ceed));
+  CeedCall(CeedObjectDestroy_Private(&(*qf)->obj));
   CeedCall(CeedFree(qf));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c
index ddb9549fa4..48563a9999 100644
--- a/interface/ceed-qfunctioncontext.c
+++ b/interface/ceed-qfunctioncontext.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -59,12 +59,11 @@ int CeedQFunctionContextRegisterGeneric(CeedQFunctionContext ctx, const char *fi
                                         CeedContextFieldType field_type, size_t num_values) {
   size_t  field_size  = 0;
   CeedInt field_index = -1;
-  Ceed    ceed;
 
   // Check for duplicate
-  CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCall(CeedQFunctionContextGetFieldIndex(ctx, field_name, &field_index));
-  CeedCheck(field_index == -1, ceed, CEED_ERROR_UNSUPPORTED, "QFunctionContext field with name \"%s\" already registered", field_name);
+  CeedCheck(field_index == -1, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED,
+            "QFunctionContext field with name \"%s\" already registered", field_name);
 
   // Allocate space for field data
   if (ctx->num_fields == 0) {
@@ -128,6 +127,35 @@ static int CeedQFunctionContextDestroyData(CeedQFunctionContext ctx) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief View a `CeedQFunctionContext` passed as a `CeedObject`
+
+  @param[in] ctx    `CeedQFunctionContext` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedQFunctionContextView_Object(CeedObject ctx, FILE *stream) {
+  CeedCall(CeedQFunctionContextView((CeedQFunctionContext)ctx, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Destroy a `CeedQFunctionContext` passed as a `CeedObject`
+
+  @param[in,out] ctx Address of `CeedQFunctionContext` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedQFunctionContextDestroy_Object(CeedObject *ctx) {
+  CeedCall(CeedQFunctionContextDestroy((CeedQFunctionContext *)ctx));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -147,7 +175,7 @@ static int CeedQFunctionContextDestroyData(CeedQFunctionContext ctx) {
   @ref Backend
 **/
 int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed) {
-  *ceed = CeedQFunctionContextReturnCeed(ctx);
+  CeedCall(CeedObjectGetCeed((CeedObject)ctx, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -160,7 +188,7 @@ int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed) {
 
   @ref Backend
 **/
-Ceed CeedQFunctionContextReturnCeed(CeedQFunctionContext ctx) { return ctx->ceed; }
+Ceed CeedQFunctionContextReturnCeed(CeedQFunctionContext ctx) { return CeedObjectReturnCeed((CeedObject)ctx); }
 
 /**
   @brief Check for valid data in a `CeedQFunctionContext`
@@ -542,7 +570,7 @@ int CeedQFunctionContextGetDataDestroy(CeedQFunctionContext ctx, CeedMemType *f_
   @ref Backend
 **/
 int CeedQFunctionContextReference(CeedQFunctionContext ctx) {
-  ctx->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)ctx));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -569,14 +597,14 @@ int CeedQFunctionContextCreate(Ceed ceed, CeedQFunctionContext *ctx) {
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Context"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextCreate");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedQFunctionContextCreate");
     CeedCall(CeedQFunctionContextCreate(delegate, ctx));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
   CeedCall(CeedCalloc(1, ctx));
-  CeedCall(CeedReferenceCopy(ceed, &(*ctx)->ceed));
-  (*ctx)->ref_count = 1;
+  CeedCall(CeedObjectCreate(ceed, CeedQFunctionContextView_Object, CeedQFunctionContextDestroy_Object, &(*ctx)->obj));
   CeedCall(ceed->QFunctionContextCreate(*ctx));
   return CEED_ERROR_SUCCESS;
 }
@@ -620,11 +648,9 @@ int CeedQFunctionContextReferenceCopy(CeedQFunctionContext ctx, CeedQFunctionCon
   @ref User
 **/
 int CeedQFunctionContextSetData(CeedQFunctionContext ctx, CeedMemType mem_type, CeedCopyMode copy_mode, size_t size, void *data) {
-  Ceed ceed;
-
-  CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed));
-  CeedCheck(ctx->SetData, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextSetData");
-  CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
+  CeedCheck(ctx->SetData, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextSetData");
+  CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1,
+            "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
 
   CeedCall(CeedQFunctionContextDestroyData(ctx));
   ctx->ctx_size = size;
@@ -650,17 +676,16 @@ int CeedQFunctionContextSetData(CeedQFunctionContext ctx, CeedMemType mem_type,
 int CeedQFunctionContextTakeData(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) {
   void *temp_data      = NULL;
   bool  has_valid_data = true, has_borrowed_data_of_type = true;
-  Ceed  ceed;
 
-  CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data));
-  CeedCheck(has_valid_data, ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to take, must set data");
+  CeedCheck(has_valid_data, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to take, must set data");
 
-  CeedCheck(ctx->TakeData, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextTakeData");
-  CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
+  CeedCheck(ctx->TakeData, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextTakeData");
+  CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1,
+            "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
 
   CeedCall(CeedQFunctionContextHasBorrowedDataOfType(ctx, mem_type, &has_borrowed_data_of_type));
-  CeedCheck(has_borrowed_data_of_type, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_borrowed_data_of_type, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND,
             "CeedQFunctionContext has no borrowed %s data, must set data with CeedQFunctionContextSetData", CeedMemTypes[mem_type]);
 
   CeedCall(ctx->TakeData(ctx, mem_type, &temp_data));
@@ -687,15 +712,15 @@ int CeedQFunctionContextTakeData(CeedQFunctionContext ctx, CeedMemType mem_type,
 **/
 int CeedQFunctionContextGetData(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) {
   bool has_valid_data = true;
-  Ceed ceed;
 
-  CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed));
-  CeedCheck(ctx->GetData, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextGetData");
-  CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
-  CeedCheck(ctx->num_readers == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, a process has read access");
+  CeedCheck(ctx->GetData, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextGetData");
+  CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1,
+            "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
+  CeedCheck(ctx->num_readers == 0, CeedQFunctionContextReturnCeed(ctx), 1,
+            "Cannot grant CeedQFunctionContext data access, a process has read access");
 
   CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data));
-  CeedCheck(has_valid_data, ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data");
+  CeedCheck(has_valid_data, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data");
 
   CeedCall(ctx->GetData(ctx, mem_type, data));
   ctx->state++;
@@ -721,14 +746,14 @@ int CeedQFunctionContextGetData(CeedQFunctionContext ctx, CeedMemType mem_type,
 **/
 int CeedQFunctionContextGetDataRead(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) {
   bool has_valid_data = true;
-  Ceed ceed;
 
-  CeedCall(CeedQFunctionContextGetCeed(ctx, &ceed));
-  CeedCheck(ctx->GetDataRead, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedQFunctionContextGetDataRead");
-  CeedCheck(ctx->state % 2 == 0, ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
+  CeedCheck(ctx->GetDataRead, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_UNSUPPORTED,
+            "Backend does not support CeedQFunctionContextGetDataRead");
+  CeedCheck(ctx->state % 2 == 0, CeedQFunctionContextReturnCeed(ctx), 1,
+            "Cannot grant CeedQFunctionContext data access, the access lock is already in use");
 
   CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data));
-  CeedCheck(has_valid_data, ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data");
+  CeedCheck(has_valid_data, CeedQFunctionContextReturnCeed(ctx), CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data");
 
   CeedCall(ctx->GetDataRead(ctx, mem_type, data));
   ctx->num_readers++;
@@ -883,6 +908,36 @@ int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_siz
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedQFunctionContextView() output
+
+  @param[in] ctx      `CeedQFunctionContext` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedQFunctionContextSetNumViewTabs(CeedQFunctionContext ctx, CeedInt num_tabs) {
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)ctx, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get the number of tabs to indent for @ref CeedQFunctionContextView() output
+
+  @param[in]  ctx      `CeedQFunctionContext` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedQFunctionContextGetNumViewTabs(CeedQFunctionContext ctx, CeedInt *num_tabs) {
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)ctx, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedQFunctionContext`
 
@@ -894,11 +949,22 @@ int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_siz
   @ref User
 **/
 int CeedQFunctionContextView(CeedQFunctionContext ctx, FILE *stream) {
-  fprintf(stream, "CeedQFunctionContext\n");
-  fprintf(stream, "  Context Data Size: %zu\n", ctx->ctx_size);
+  char *tabs = NULL;
+
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedQFunctionContextGetNumViewTabs(ctx, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
+
+  fprintf(stream, "%sCeedQFunctionContext\n", tabs);
+  fprintf(stream, "%s  Context Data Size: %zu\n", tabs, ctx->ctx_size);
   for (CeedInt i = 0; i < ctx->num_fields; i++) {
-    fprintf(stream, "  Labeled %s field: %s\n", CeedContextFieldTypes[ctx->field_labels[i]->type], ctx->field_labels[i]->name);
+    fprintf(stream, "%s  Labeled %s field: %s\n", tabs, CeedContextFieldTypes[ctx->field_labels[i]->type], ctx->field_labels[i]->name);
   }
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -930,11 +996,11 @@ int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, CeedMemType f_m
   @ref User
 **/
 int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx) {
-  if (!*ctx || --(*ctx)->ref_count > 0) {
+  if (!*ctx || CeedObjectDereference((CeedObject)*ctx) > 0) {
     *ctx = NULL;
     return CEED_ERROR_SUCCESS;
   }
-  CeedCheck(((*ctx)->state % 2) == 0, (*ctx)->ceed, 1, "Cannot destroy CeedQFunctionContext, the access lock is in use");
+  CeedCheck(((*ctx)->state % 2) == 0, CeedQFunctionContextReturnCeed(*ctx), 1, "Cannot destroy CeedQFunctionContext, the access lock is in use");
 
   CeedCall(CeedQFunctionContextDestroyData(*ctx));
   if ((*ctx)->Destroy) CeedCall((*ctx)->Destroy(*ctx));
@@ -944,7 +1010,7 @@ int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx) {
     CeedCall(CeedFree(&(*ctx)->field_labels[i]));
   }
   CeedCall(CeedFree(&(*ctx)->field_labels));
-  CeedCall(CeedDestroy(&(*ctx)->ceed));
+  CeedCall(CeedObjectDestroy_Private(&(*ctx)->obj));
   CeedCall(CeedFree(ctx));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-register.c b/interface/ceed-register.c
index bdc8a95d10..759a6463fb 100644
--- a/interface/ceed-register.c
+++ b/interface/ceed-register.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -32,7 +32,7 @@ int CeedRegisterAll(void) {
 
   CeedPragmaCritical(CeedRegisterAll) {
     if (!register_all_called) {
-      CeedDebugEnv256(1, "\n---------- Registering Backends ----------\n");
+      CeedDebugEnv256(CEED_DEBUG_COLOR_SUCCESS, "\n---------- Registering Backends ----------\n");
 #define CEED_BACKEND(name, ...) \
   if (!ierr) ierr = name();
 #include "../backends/ceed-backend-list.h"
diff --git a/interface/ceed-tensor.c b/interface/ceed-tensor.c
index 7cbc69e00c..24f3687c62 100644
--- a/interface/ceed-tensor.c
+++ b/interface/ceed-tensor.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -13,6 +13,28 @@
 /// @file
 /// Implementation of CeedTensorContract interfaces
 
+/// ----------------------------------------------------------------------------
+/// CeedTensorContract Library Internal Functions
+/// ----------------------------------------------------------------------------
+/// @addtogroup CeedTensorContractDeveloper
+/// @{
+
+/**
+  @brief Destroy a `CeedTensorContract` passed as a `CeedObject`
+
+  @param[in,out] contract Address of `CeedTensorContract` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedTensorContractDestroy_Object(CeedObject *contract) {
+  CeedCall(CeedTensorContractDestroy((CeedTensorContract *)contract));
+  return CEED_ERROR_SUCCESS;
+}
+
+/// @}
+
 /// ----------------------------------------------------------------------------
 /// CeedTensorContract Backend API
 /// ----------------------------------------------------------------------------
@@ -34,13 +56,14 @@ int CeedTensorContractCreate(Ceed ceed, CeedTensorContract *contract) {
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "TensorContract"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedTensorContractCreate");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement CeedTensorContractCreate");
     CeedCall(CeedTensorContractCreate(delegate, contract));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
   CeedCall(CeedCalloc(1, contract));
-  CeedCall(CeedReferenceCopy(ceed, &(*contract)->ceed));
+  CeedCall(CeedObjectCreate(ceed, NULL, CeedTensorContractDestroy_Object, &(*contract)->obj));
   CeedCall(ceed->TensorContractCreate(*contract));
   return CEED_ERROR_SUCCESS;
 }
@@ -123,7 +146,7 @@ int CeedTensorContractStridedApply(CeedTensorContract contract, CeedInt A, CeedI
   @ref Backend
 **/
 int CeedTensorContractGetCeed(CeedTensorContract contract, Ceed *ceed) {
-  *ceed = CeedTensorContractReturnCeed(contract);
+  CeedCall(CeedObjectGetCeed((CeedObject)contract, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -136,7 +159,7 @@ int CeedTensorContractGetCeed(CeedTensorContract contract, Ceed *ceed) {
 
   @ref Backend
 **/
-Ceed CeedTensorContractReturnCeed(CeedTensorContract contract) { return contract->ceed; }
+Ceed CeedTensorContractReturnCeed(CeedTensorContract contract) { return CeedObjectReturnCeed((CeedObject)contract); }
 
 /**
   @brief Get backend data of a `CeedTensorContract`
@@ -178,7 +201,7 @@ int CeedTensorContractSetData(CeedTensorContract contract, void *data) {
   @ref Backend
 **/
 int CeedTensorContractReference(CeedTensorContract contract) {
-  contract->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)contract));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -214,14 +237,14 @@ int CeedTensorContractReferenceCopy(CeedTensorContract tensor, CeedTensorContrac
   @ref Backend
 **/
 int CeedTensorContractDestroy(CeedTensorContract *contract) {
-  if (!*contract || --(*contract)->ref_count > 0) {
+  if (!*contract || CeedObjectDereference((CeedObject)*contract) > 0) {
     *contract = NULL;
     return CEED_ERROR_SUCCESS;
   }
   if ((*contract)->Destroy) {
     CeedCall((*contract)->Destroy(*contract));
   }
-  CeedCall(CeedDestroy(&(*contract)->ceed));
+  CeedCall(CeedObjectDestroy_Private(&(*contract)->obj));
   CeedCall(CeedFree(contract));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed-types.c b/interface/ceed-types.c
index 564a5b009a..cbec562cff 100644
--- a/interface/ceed-types.c
+++ b/interface/ceed-types.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c
index 39b72c770f..eb9f4fc85b 100644
--- a/interface/ceed-vector.c
+++ b/interface/ceed-vector.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -33,6 +33,43 @@ const CeedVector CEED_VECTOR_NONE = &ceed_vector_none;
 
 /// @}
 
+/// ----------------------------------------------------------------------------
+/// CeedVector Internal Functions
+/// ----------------------------------------------------------------------------
+/// @addtogroup CeedVectorDeveloper
+/// @{
+
+/**
+  @brief View a `CeedVector` passed as a `CeedObject`
+
+  @param[in] vec    `CeedVector` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedVectorView_Object(CeedObject vec, FILE *stream) {
+  CeedCall(CeedVectorView((CeedVector)vec, "%12.8f", stream));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Destroy a `CeedVector` passed as a `CeedObject`
+
+  @param[in,out] vec Address of `CeedVector` to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedVectorDestroy_Object(CeedObject *vec) {
+  CeedCall(CeedVectorDestroy((CeedVector *)vec));
+  return CEED_ERROR_SUCCESS;
+}
+
+/// @}
+
 /// ----------------------------------------------------------------------------
 /// CeedVector Backend API
 /// ----------------------------------------------------------------------------
@@ -135,7 +172,7 @@ int CeedVectorSetData(CeedVector vec, void *data) {
   @ref Backend
 **/
 int CeedVectorReference(CeedVector vec) {
-  vec->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)vec));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -159,20 +196,21 @@ int CeedVectorReference(CeedVector vec) {
   @ref User
 **/
 int CeedVectorCreate(Ceed ceed, CeedSize length, CeedVector *vec) {
+  CeedCheck(length >= 0, ceed, CEED_ERROR_UNSUPPORTED, "CeedVector must have length >= 0, received %" CeedSize_FMT, length);
   if (!ceed->VectorCreate) {
     Ceed delegate;
 
     CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
-    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support VectorCreate");
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
     CeedCall(CeedVectorCreate(delegate, length, vec));
+    CeedCall(CeedDestroy(&delegate));
     return CEED_ERROR_SUCCESS;
   }
 
   CeedCall(CeedCalloc(1, vec));
-  CeedCall(CeedReferenceCopy(ceed, &(*vec)->ceed));
-  (*vec)->ref_count = 1;
-  (*vec)->length    = length;
-  (*vec)->state     = 0;
+  CeedCall(CeedObjectCreate(ceed, CeedVectorView_Object, CeedVectorDestroy_Object, &(*vec)->obj));
+  (*vec)->length = length;
+  (*vec)->state  = 0;
   CeedCall(ceed->VectorCreate(length, *vec));
   return CEED_ERROR_SUCCESS;
 }
@@ -202,34 +240,42 @@ int CeedVectorReferenceCopy(CeedVector vec, CeedVector *vec_copy) {
 /**
   @brief Copy a `CeedVector` into a different `CeedVector`.
 
-  Both pointers should be destroyed with @ref CeedVectorDestroy().
-
-  Note: If `*vec_copy` is non-`NULL`, then it is assumed that `*vec_copy` is a pointer to a `CeedVector`.
-        This `CeedVector` will be destroyed if `*vec_copy` is the only reference to this `CeedVector`.
-
   @param[in]     vec      `CeedVector` to copy
-  @param[in,out] vec_copy Variable to store copied `CeedVector` to
+  @param[in,out] vec_copy `CeedVector` to copy array into
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref User
 **/
 int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) {
-  Ceed        ceed;
   CeedMemType mem_type, mem_type_copy;
   CeedScalar *array;
 
-  // Get the preferred memory type
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCall(CeedGetPreferredMemType(ceed, &mem_type));
+  // Get the preferred memory types
+  {
+    Ceed ceed;
 
-  // Get the preferred memory type
-  CeedCall(CeedVectorGetCeed(vec_copy, &ceed));
-  CeedCall(CeedGetPreferredMemType(ceed, &mem_type_copy));
+    CeedCall(CeedVectorGetCeed(vec, &ceed));
+    CeedCall(CeedGetPreferredMemType(ceed, &mem_type));
+    CeedCall(CeedDestroy(&ceed));
+
+    CeedCall(CeedVectorGetCeed(vec_copy, &ceed));
+    CeedCall(CeedGetPreferredMemType(ceed, &mem_type_copy));
+    CeedCall(CeedDestroy(&ceed));
+  }
 
   // Check that both have same memory type
   if (mem_type != mem_type_copy) mem_type = CEED_MEM_HOST;
 
+  // Check compatible lengths
+  {
+    CeedSize length_vec, length_copy;
+
+    CeedCall(CeedVectorGetLength(vec, &length_vec));
+    CeedCall(CeedVectorGetLength(vec_copy, &length_copy));
+    CeedCheck(length_vec == length_copy, CeedVectorReturnCeed(vec), CEED_ERROR_INCOMPATIBLE, "CeedVectors must have the same length to copy");
+  }
+
   // Copy the values from vec to vec_copy
   CeedCall(CeedVectorGetArray(vec, mem_type, &array));
   CeedCall(CeedVectorSetArray(vec_copy, mem_type, CEED_COPY_VALUES, array));
@@ -238,6 +284,57 @@ int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Copy a strided portion of `CeedVector` contents into a different `CeedVector`
+
+  @param[in]     vec      `CeedVector` to copy
+  @param[in]     start    First index to copy in the range `[start, stop)`
+  @param[in]     stop     One past the last element to copy in the range, or `-1` for `length`
+  @param[in]     step     Stride between indices to copy
+  @param[in,out] vec_copy `CeedVector` to copy values to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedVectorCopyStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedVector vec_copy) {
+  CeedSize          length;
+  const CeedScalar *array      = NULL;
+  CeedScalar       *array_copy = NULL;
+
+  // Check length
+  {
+    CeedSize length_vec, length_copy;
+
+    CeedCall(CeedVectorGetLength(vec, &length_vec));
+    CeedCall(CeedVectorGetLength(vec_copy, &length_copy));
+    if (length_vec <= 0 || length_copy <= 0) return CEED_ERROR_SUCCESS;
+    length = length_vec < length_copy ? length_vec : length_copy;
+  }
+  CeedCheck(stop >= -1 && stop <= length, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Invalid value for stop %" CeedSize_FMT ", must be in the range [-1, length]", stop);
+  CeedCheck(start >= 0 && start <= length && (start <= stop || stop == -1), CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Invalid value for start %" CeedSize_FMT ", must be in the range [0, stop]", start);
+
+  // Backend version
+  if (vec->CopyStrided && vec_copy->CopyStrided) {
+    CeedCall(vec->CopyStrided(vec, start, stop, step, vec_copy));
+    vec_copy->state += 2;
+    return CEED_ERROR_SUCCESS;
+  }
+
+  // Copy
+  CeedCall(CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &array));
+  CeedCall(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &array_copy));
+  if (stop == -1) stop = length;
+  for (CeedSize i = start; i < stop; i += step) array_copy[i] = array[i];
+
+  // Cleanup
+  CeedCall(CeedVectorRestoreArrayRead(vec, &array));
+  CeedCall(CeedVectorRestoreArray(vec_copy, &array_copy));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief Set the array used by a `CeedVector`, freeing any previously allocated array if applicable.
 
@@ -255,13 +352,11 @@ int CeedVectorCopy(CeedVector vec, CeedVector vec_copy) {
 **/
 int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array) {
   CeedSize length;
-  Ceed     ceed;
-
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
 
-  CeedCheck(vec->SetArray, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support VectorSetArray");
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use");
-  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+  CeedCheck(vec->SetArray, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support VectorSetArray");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector array access, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
 
   CeedCall(CeedVectorGetLength(vec, &length));
   if (length > 0) CeedCall(vec->SetArray(vec, mem_type, copy_mode, array));
@@ -280,14 +375,13 @@ int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_m
   @ref User
 **/
 int CeedVectorSetValue(CeedVector vec, CeedScalar value) {
-  Ceed ceed;
-
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use");
-  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector array access, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
 
   if (vec->SetValue) {
     CeedCall(vec->SetValue(vec, value));
+    vec->state += 2;
   } else {
     CeedSize    length;
     CeedScalar *array;
@@ -297,7 +391,46 @@ int CeedVectorSetValue(CeedVector vec, CeedScalar value) {
     for (CeedSize i = 0; i < length; i++) array[i] = value;
     CeedCall(CeedVectorRestoreArray(vec, &array));
   }
-  vec->state += 2;
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Set a portion of a `CeedVector` to a constant value.
+
+  Note: The `CeedVector` must already have valid data set via @ref CeedVectorSetArray() or similar.
+
+  @param[in,out] vec   `CeedVector`
+  @param[in]     start First index to set in range `[start, stop)`
+  @param[in]     stop  One past the last element to set in the range, or `-1` for `length`
+  @param[in]     step  Stride between indices to set
+  @param[in]     value Value to be used
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedVectorSetValueStrided(CeedVector vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar value) {
+  CeedSize length;
+
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector array access, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+  CeedCall(CeedVectorGetLength(vec, &length));
+  CeedCheck(stop >= -1 && stop <= length, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Invalid value for stop %" CeedSize_FMT ", must be in the range [-1, length]", stop);
+
+  if (vec->SetValueStrided) {
+    CeedCall(vec->SetValueStrided(vec, start, stop, step, value));
+    vec->state += 2;
+  } else {
+    CeedScalar *array;
+
+    if (length <= 0) return CEED_ERROR_SUCCESS;
+    if (stop == -1) stop = length;
+    CeedCall(CeedVectorGetArray(vec, CEED_MEM_HOST, &array));
+    for (CeedSize i = start; i < stop; i += step) array[i] = value;
+    CeedCall(CeedVectorRestoreArray(vec, &array));
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -352,22 +485,20 @@ int CeedVectorSyncArray(CeedVector vec, CeedMemType mem_type) {
 int CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   CeedSize    length;
   CeedScalar *temp_array = NULL;
-  Ceed        ceed;
 
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot take CeedVector array, the access lock is already in use");
-  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot take CeedVector array, a process has read access");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot take CeedVector array, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot take CeedVector array, a process has read access");
 
   CeedCall(CeedVectorGetLength(vec, &length));
   if (length > 0) {
     bool has_borrowed_array_of_type = true, has_valid_array = true;
 
     CeedCall(CeedVectorHasBorrowedArrayOfType(vec, mem_type, &has_borrowed_array_of_type));
-    CeedCheck(has_borrowed_array_of_type, ceed, CEED_ERROR_BACKEND, "CeedVector has no borrowed %s array, must set array with CeedVectorSetArray",
-              CeedMemTypes[mem_type]);
+    CeedCheck(has_borrowed_array_of_type, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
+              "CeedVector has no borrowed %s array, must set array with CeedVectorSetArray", CeedMemTypes[mem_type]);
 
     CeedCall(CeedVectorHasValidArray(vec, &has_valid_array));
-    CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND,
+    CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
               "CeedVector has no valid data to take, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
     CeedCall(vec->TakeArray(vec, mem_type, &temp_array));
@@ -395,19 +526,18 @@ int CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array
 **/
 int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   CeedSize length;
-  Ceed     ceed;
 
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCheck(vec->GetArray, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetArray");
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use");
-  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+  CeedCheck(vec->GetArray, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support GetArray");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector array access, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
 
   CeedCall(CeedVectorGetLength(vec, &length));
   if (length > 0) {
     bool has_valid_array = true;
 
     CeedCall(CeedVectorHasValidArray(vec, &has_valid_array));
-    CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND,
+    CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
               "CeedVector has no valid data to read, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
     CeedCall(vec->GetArray(vec, mem_type, array));
@@ -434,18 +564,17 @@ int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array)
 **/
 int CeedVectorGetArrayRead(CeedVector vec, CeedMemType mem_type, const CeedScalar **array) {
   CeedSize length;
-  Ceed     ceed;
 
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCheck(vec->GetArrayRead, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetArrayRead");
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector read-only array access, the access lock is already in use");
+  CeedCheck(vec->GetArrayRead, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support GetArrayRead");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector read-only array access, the access lock is already in use");
 
   CeedCall(CeedVectorGetLength(vec, &length));
   if (length > 0) {
     bool has_valid_array = true;
 
     CeedCall(CeedVectorHasValidArray(vec, &has_valid_array));
-    CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND,
+    CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
               "CeedVector has no valid data to read, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
     CeedCall(vec->GetArrayRead(vec, mem_type, array));
@@ -472,12 +601,11 @@ int CeedVectorGetArrayRead(CeedVector vec, CeedMemType mem_type, const CeedScala
 **/
 int CeedVectorGetArrayWrite(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
   CeedSize length;
-  Ceed     ceed;
 
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
-  CeedCheck(vec->GetArrayWrite, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedVectorGetArrayWrite");
-  CeedCheck(vec->state % 2 == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use");
-  CeedCheck(vec->num_readers == 0, ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
+  CeedCheck(vec->GetArrayWrite, CeedVectorReturnCeed(vec), CEED_ERROR_UNSUPPORTED, "Backend does not support CeedVectorGetArrayWrite");
+  CeedCheck(vec->state % 2 == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS,
+            "Cannot grant CeedVector array access, the access lock is already in use");
+  CeedCheck(vec->num_readers == 0, CeedVectorReturnCeed(vec), CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access");
 
   CeedCall(CeedVectorGetLength(vec, &length));
   if (length > 0) {
@@ -644,24 +772,36 @@ int CeedVectorAXPY(CeedVector y, CeedScalar alpha, CeedVector x) {
   CeedSize          length_x, length_y;
   CeedScalar       *y_array = NULL;
   CeedScalar const *x_array = NULL;
-  Ceed              ceed, ceed_parent_x, ceed_parent_y;
 
-  CeedCall(CeedVectorGetCeed(y, &ceed));
   CeedCall(CeedVectorGetLength(y, &length_y));
   CeedCall(CeedVectorGetLength(x, &length_x));
-  CeedCheck(length_x == length_y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot add vector of different lengths");
-  CeedCheck(x != y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPY");
+  CeedCheck(length_x == length_y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED,
+            "Cannot add vector of different lengths."
+            " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
+            length_x, length_y);
+  CeedCheck(x != y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPY");
 
   CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x));
-  CeedCheck(has_valid_array_x, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_x, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND,
             "CeedVector x has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
   CeedCall(CeedVectorHasValidArray(y, &has_valid_array_y));
-  CeedCheck(has_valid_array_y, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_y, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND,
             "CeedVector y has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
-  CeedCall(CeedGetParent(x->ceed, &ceed_parent_x));
-  CeedCall(CeedGetParent(y->ceed, &ceed_parent_y));
-  CeedCheck(ceed_parent_x == ceed_parent_y, ceed, CEED_ERROR_INCOMPATIBLE, "Vectors x and y must be created by the same Ceed context");
+  {
+    Ceed ceed_x, ceed_y, ceed_parent_x, ceed_parent_y;
+
+    CeedCall(CeedVectorGetCeed(y, &ceed_y));
+    CeedCall(CeedVectorGetCeed(x, &ceed_x));
+    CeedCall(CeedGetParent(ceed_x, &ceed_parent_x));
+    CeedCall(CeedGetParent(ceed_y, &ceed_parent_y));
+    CeedCall(CeedDestroy(&ceed_x));
+    CeedCall(CeedDestroy(&ceed_y));
+    CeedCheck(ceed_parent_x == ceed_parent_y, CeedVectorReturnCeed(y), CEED_ERROR_INCOMPATIBLE,
+              "Vectors x and y must be created by the same Ceed context");
+    CeedCall(CeedDestroy(&ceed_parent_x));
+    CeedCall(CeedDestroy(&ceed_parent_y));
+  }
 
   // Return early for empty vectors
   if (length_y == 0) return CEED_ERROR_SUCCESS;
@@ -703,25 +843,36 @@ int CeedVectorAXPBY(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector
   CeedSize          length_x, length_y;
   CeedScalar       *y_array = NULL;
   CeedScalar const *x_array = NULL;
-  Ceed              ceed, ceed_parent_x, ceed_parent_y;
-
-  CeedCall(CeedVectorGetCeed(y, &ceed));
 
   CeedCall(CeedVectorGetLength(y, &length_y));
   CeedCall(CeedVectorGetLength(x, &length_x));
-  CeedCheck(length_x == length_y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot add vector of different lengths");
-  CeedCheck(x != y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPBY");
+  CeedCheck(length_x == length_y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED,
+            "Cannot add vector of different lengths."
+            " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
+            length_x, length_y);
+  CeedCheck(x != y, CeedVectorReturnCeed(y), CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPBY");
 
   CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x));
-  CeedCheck(has_valid_array_x, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_x, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND,
             "CeedVector x has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
   CeedCall(CeedVectorHasValidArray(y, &has_valid_array_y));
-  CeedCheck(has_valid_array_y, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_y, CeedVectorReturnCeed(y), CEED_ERROR_BACKEND,
             "CeedVector y has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
-  CeedCall(CeedGetParent(x->ceed, &ceed_parent_x));
-  CeedCall(CeedGetParent(y->ceed, &ceed_parent_y));
-  CeedCheck(ceed_parent_x == ceed_parent_y, ceed, CEED_ERROR_INCOMPATIBLE, "Vectors x and y must be created by the same Ceed context");
+  {
+    Ceed ceed_x, ceed_y, ceed_parent_x, ceed_parent_y;
+
+    CeedCall(CeedVectorGetCeed(y, &ceed_y));
+    CeedCall(CeedVectorGetCeed(x, &ceed_x));
+    CeedCall(CeedGetParent(ceed_x, &ceed_parent_x));
+    CeedCall(CeedGetParent(ceed_y, &ceed_parent_y));
+    CeedCall(CeedDestroy(&ceed_x));
+    CeedCall(CeedDestroy(&ceed_y));
+    CeedCheck(ceed_parent_x == ceed_parent_y, CeedVectorReturnCeed(y), CEED_ERROR_INCOMPATIBLE,
+              "Vectors x and y must be created by the same Ceed context");
+    CeedCall(CeedDestroy(&ceed_parent_x));
+    CeedCall(CeedDestroy(&ceed_parent_y));
+  }
 
   // Return early for empty vectors
   if (length_y == 0) return CEED_ERROR_SUCCESS;
@@ -764,25 +915,39 @@ int CeedVectorPointwiseMult(CeedVector w, CeedVector x, CeedVector y) {
   CeedScalar       *w_array = NULL;
   CeedScalar const *x_array = NULL, *y_array = NULL;
   CeedSize          length_w, length_x, length_y;
-  Ceed              ceed, ceed_parent_w, ceed_parent_x, ceed_parent_y;
 
-  CeedCall(CeedVectorGetCeed(w, &ceed));
   CeedCall(CeedVectorGetLength(w, &length_w));
   CeedCall(CeedVectorGetLength(x, &length_x));
   CeedCall(CeedVectorGetLength(y, &length_y));
-  CeedCheck(length_w == length_x && length_w == length_y, ceed, CEED_ERROR_UNSUPPORTED, "Cannot multiply vectors of different lengths");
-
-  CeedCall(CeedGetParent(w->ceed, &ceed_parent_w));
-  CeedCall(CeedGetParent(x->ceed, &ceed_parent_x));
-  CeedCall(CeedGetParent(y->ceed, &ceed_parent_y));
-  CeedCheck(ceed_parent_w == ceed_parent_x && ceed_parent_w == ceed_parent_y, ceed, CEED_ERROR_INCOMPATIBLE,
-            "Vectors w, x, and y must be created by the same Ceed context");
+  CeedCheck(length_x >= length_w && length_y >= length_w, CeedVectorReturnCeed(w), CEED_ERROR_UNSUPPORTED,
+            "Cannot pointwise multiply vectors of incompatible lengths."
+            " w length: %" CeedSize_FMT " x length: %" CeedSize_FMT " y length: %" CeedSize_FMT,
+            length_w, length_x, length_y);
+
+  {
+    Ceed ceed_w, ceed_x, ceed_y, ceed_parent_w, ceed_parent_x, ceed_parent_y;
+
+    CeedCall(CeedVectorGetCeed(w, &ceed_w));
+    CeedCall(CeedVectorGetCeed(x, &ceed_x));
+    CeedCall(CeedVectorGetCeed(y, &ceed_y));
+    CeedCall(CeedGetParent(ceed_w, &ceed_parent_w));
+    CeedCall(CeedGetParent(ceed_x, &ceed_parent_x));
+    CeedCall(CeedGetParent(ceed_y, &ceed_parent_y));
+    CeedCall(CeedDestroy(&ceed_w));
+    CeedCall(CeedDestroy(&ceed_x));
+    CeedCall(CeedDestroy(&ceed_y));
+    CeedCheck(ceed_parent_w == ceed_parent_x && ceed_parent_w == ceed_parent_y, CeedVectorReturnCeed(w), CEED_ERROR_INCOMPATIBLE,
+              "Vectors w, x, and y must be created by the same Ceed context");
+    CeedCall(CeedDestroy(&ceed_parent_w));
+    CeedCall(CeedDestroy(&ceed_parent_x));
+    CeedCall(CeedDestroy(&ceed_parent_y));
+  }
 
   CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x));
-  CeedCheck(has_valid_array_x, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_x, CeedVectorReturnCeed(w), CEED_ERROR_BACKEND,
             "CeedVector x has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
   CeedCall(CeedVectorHasValidArray(y, &has_valid_array_y));
-  CeedCheck(has_valid_array_y, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array_y, CeedVectorReturnCeed(w), CEED_ERROR_BACKEND,
             "CeedVector y has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
   // Return early for empty vectors
@@ -838,15 +1003,13 @@ int CeedVectorReciprocal(CeedVector vec) {
   bool        has_valid_array = true;
   CeedSize    length;
   CeedScalar *array;
-  Ceed        ceed;
 
-  CeedCall(CeedVectorGetCeed(vec, &ceed));
   CeedCall(CeedVectorHasValidArray(vec, &has_valid_array));
-  CeedCheck(has_valid_array, ceed, CEED_ERROR_BACKEND,
+  CeedCheck(has_valid_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND,
             "CeedVector has no valid data to compute reciprocal, must set data with CeedVectorSetValue or CeedVectorSetArray");
 
   // Check if vector data set
-  CeedCheck(vec->state > 0, ceed, CEED_ERROR_INCOMPLETE, "CeedVector must have data set to take reciprocal");
+  CeedCheck(vec->state > 0, CeedVectorReturnCeed(vec), CEED_ERROR_INCOMPLETE, "CeedVector must have data set to take reciprocal");
 
   // Return early for empty vector
   CeedCall(CeedVectorGetLength(vec, &length));
@@ -867,6 +1030,36 @@ int CeedVectorReciprocal(CeedVector vec) {
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Set the number of tabs to indent for @ref CeedVectorView() output
+
+  @param[in] vec      `CeedVector` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedVectorSetNumViewTabs(CeedVector vec, CeedInt num_tabs) {
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)vec, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get the number of tabs to indent for @ref CeedVectorView() output
+
+  @param[in]  vec      `CeedVector` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedVectorGetNumViewTabs(CeedVector vec, CeedInt *num_tabs) {
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)vec, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
 /**
   @brief View a `CeedVector`
 
@@ -874,8 +1067,8 @@ int CeedVectorReciprocal(CeedVector vec) {
         Any portion of the provided range that is outside the range of valid indices for the `CeedVector` will be ignored.
 
   @param[in] vec    `CeedVector` to view
-  @param[in] start  Index of first `CeedVector` entry to view
-  @param[in] stop   Index of last `CeedVector` entry to view
+  @param[in] start  Index of first `CeedVector` entry to view in the range `[start, stop)`
+  @param[in] stop   One past the last element to view in the range, or `-1` for `length`
   @param[in] step   Step between `CeedVector` entries to view
   @param[in] fp_fmt Printing format
   @param[in] stream Filestream to write to
@@ -886,24 +1079,34 @@ int CeedVectorReciprocal(CeedVector vec) {
 **/
 int CeedVectorViewRange(CeedVector vec, CeedSize start, CeedSize stop, CeedInt step, const char *fp_fmt, FILE *stream) {
   char              fmt[1024];
+  char             *tabs = NULL;
   CeedSize          length;
   const CeedScalar *x;
 
   CeedCheck(step != 0, CeedVectorReturnCeed(vec), CEED_ERROR_MINOR, "View range 'step' must be nonzero");
 
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedVectorGetNumViewTabs(vec, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
+
   CeedCall(CeedVectorGetLength(vec, &length));
-  fprintf(stream, "CeedVector length %" CeedSize_FMT "\n", length);
+  fprintf(stream, "%sCeedVector length %" CeedSize_FMT "\n", tabs, length);
   if (start != 0 || stop != length || step != 1) {
-    fprintf(stream, "  start: %" CeedSize_FMT "\n  stop:  %" CeedSize_FMT "\n  step:  %" CeedInt_FMT "\n", start, stop, step);
+    fprintf(stream, "%s  start: %" CeedSize_FMT "\n%s  stop:  %" CeedSize_FMT "\n%s  step:  %" CeedInt_FMT "\n", tabs, start, tabs, stop, tabs, step);
   }
   if (start > length) start = length;
-  if (stop > length) stop = length;
+  if (stop == -1 || stop > length) stop = length;
 
-  snprintf(fmt, sizeof fmt, "  %s\n", fp_fmt ? fp_fmt : "%g");
+  snprintf(fmt, sizeof fmt, "%s  %s\n", tabs, fp_fmt ? fp_fmt : "%g");
   CeedCall(CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &x));
   for (CeedSize i = start; step > 0 ? (i < stop) : (i > stop); i += step) fprintf(stream, fmt, x[i]);
   CeedCall(CeedVectorRestoreArrayRead(vec, &x));
-  if (stop != length) fprintf(stream, "  ...\n");
+  if (stop != length) fprintf(stream, "%s  ...\n", tabs);
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -937,7 +1140,7 @@ int CeedVectorView(CeedVector vec, const char *fp_fmt, FILE *stream) {
   @ref Advanced
 **/
 int CeedVectorGetCeed(CeedVector vec, Ceed *ceed) {
-  *ceed = CeedVectorReturnCeed(vec);
+  CeedCall(CeedObjectGetCeed((CeedObject)vec, ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -950,7 +1153,7 @@ int CeedVectorGetCeed(CeedVector vec, Ceed *ceed) {
 
   @ref Advanced
 **/
-Ceed CeedVectorReturnCeed(CeedVector vec) { return vec->ceed; }
+Ceed CeedVectorReturnCeed(CeedVector vec) { return CeedObjectReturnCeed((CeedObject)vec); }
 
 /**
   @brief Get the length of a `CeedVector`
@@ -977,16 +1180,15 @@ int CeedVectorGetLength(CeedVector vec, CeedSize *length) {
   @ref User
 **/
 int CeedVectorDestroy(CeedVector *vec) {
-  if (!*vec || *vec == CEED_VECTOR_ACTIVE || *vec == CEED_VECTOR_NONE || --(*vec)->ref_count > 0) {
+  if (!*vec || *vec == CEED_VECTOR_ACTIVE || *vec == CEED_VECTOR_NONE || CeedObjectDereference((CeedObject)*vec) > 0) {
     *vec = NULL;
     return CEED_ERROR_SUCCESS;
   }
-  CeedCheck((*vec)->state % 2 == 0, (*vec)->ceed, CEED_ERROR_ACCESS, "Cannot destroy CeedVector, the writable access lock is in use");
-  CeedCheck((*vec)->num_readers == 0, (*vec)->ceed, CEED_ERROR_ACCESS, "Cannot destroy CeedVector, a process has read access");
+  CeedCheck((*vec)->state % 2 == 0, CeedVectorReturnCeed(*vec), CEED_ERROR_ACCESS, "Cannot destroy CeedVector, the writable access lock is in use");
+  CeedCheck((*vec)->num_readers == 0, CeedVectorReturnCeed(*vec), CEED_ERROR_ACCESS, "Cannot destroy CeedVector, a process has read access");
 
   if ((*vec)->Destroy) CeedCall((*vec)->Destroy(*vec));
-
-  CeedCall(CeedDestroy(&(*vec)->ceed));
+  CeedCall(CeedObjectDestroy_Private(&(*vec)->obj));
   CeedCall(CeedFree(vec));
   return CEED_ERROR_SUCCESS;
 }
diff --git a/interface/ceed.c b/interface/ceed.c
index ad7f09fa8e..6c94ec8db8 100644
--- a/interface/ceed.c
+++ b/interface/ceed.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -28,8 +28,7 @@ static struct {
 } backends[32];
 static size_t num_backends;
 
-#define CEED_FTABLE_ENTRY(class, method) \
-  { #class #method, offsetof(struct class##_private, method) }
+#define CEED_FTABLE_ENTRY(class, method) {#class #method, offsetof(struct class##_private, method)}
 /// @endcond
 
 /// @file
@@ -139,6 +138,75 @@ int CeedRegisterImpl(const char *prefix, int (*init)(const char *, Ceed), unsign
   return CEED_ERROR_SUCCESS;
 }
 
+/**
+  @brief Create a work vector space for a `ceed`
+
+  @param[in,out] ceed `Ceed` to create work vector space for
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedWorkVectorsCreate(Ceed ceed) {
+  CeedCall(CeedCalloc(1, &ceed->work_vectors));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Destroy a work vector space for a `ceed`
+
+  @param[in,out] ceed `Ceed` to destroy work vector space for
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedWorkVectorsDestroy(Ceed ceed) {
+  if (!ceed->work_vectors) return CEED_ERROR_SUCCESS;
+  for (CeedSize i = 0; i < ceed->work_vectors->num_vecs; i++) {
+    CeedCheck(!ceed->work_vectors->is_in_use[i], ceed, CEED_ERROR_ACCESS, "Work vector %" CeedSize_FMT " checked out but not returned");
+    // Note: increase ref_count to prevent Ceed destructor from triggering again
+    CeedCall(CeedObjectReference((CeedObject)ceed));
+    CeedCall(CeedObjectReference((CeedObject)ceed));
+    CeedCall(CeedVectorDestroy(&ceed->work_vectors->vecs[i]));
+    // Note: restore ref_count
+    CeedObjectDereference((CeedObject)ceed);
+  }
+  CeedCall(CeedFree(&ceed->work_vectors->is_in_use));
+  CeedCall(CeedFree(&ceed->work_vectors->vecs));
+  CeedCall(CeedFree(&ceed->work_vectors));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief View a `Ceed` passed as a `CeedObject`
+
+  @param[in] ceed   `Ceed` to view
+  @param[in] stream Filestream to write to
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedView_Object(CeedObject ceed, FILE *stream) {
+  CeedCall(CeedView((Ceed)ceed, stream));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Destroy a `Ceed` passed as a `CeedObject`
+
+  @param[in,out] ceed Address of `Ceed` context to destroy
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+static int CeedDestroy_Object(CeedObject *ceed) {
+  CeedCall(CeedDestroy((Ceed *)ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
 /// @}
 
 /// ----------------------------------------------------------------------------
@@ -313,10 +381,15 @@ static inline int CeedSetHostGenericArray(const void *source_array, CeedCopyMode
                                           void *target_array_owned, void *target_array_borrowed, void *target_array) {
   switch (copy_mode) {
     case CEED_COPY_VALUES:
-      if (!*(void **)target_array_owned) CeedCall(CeedCallocArray(num_values, size_unit, target_array_owned));
-      if (source_array) memcpy(*(void **)target_array_owned, source_array, size_unit * num_values);
-      *(void **)target_array_borrowed = NULL;
-      *(void **)target_array          = *(void **)target_array_owned;
+      if (!*(void **)target_array) {
+        if (*(void **)target_array_borrowed) {
+          *(void **)target_array = *(void **)target_array_borrowed;
+        } else {
+          if (!*(void **)target_array_owned) CeedCall(CeedCallocArray(num_values, size_unit, target_array_owned));
+          *(void **)target_array = *(void **)target_array_owned;
+        }
+      }
+      if (source_array) memcpy(*(void **)target_array, source_array, size_unit * num_values);
       break;
     case CEED_OWN_POINTER:
       CeedCall(CeedFree(target_array_owned));
@@ -444,7 +517,9 @@ int CeedIsDebug(Ceed ceed, bool *is_debug) {
 }
 
 /**
-  @brief Get the root of the requested resource
+  @brief Get the root of the requested resource.
+
+  Note: Caller is responsible for calling @ref CeedFree() on the `resource_root`.
 
   @param[in]  ceed          `Ceed` context to get resource name of
   @param[in]  resource      Full user specified resource
@@ -479,7 +554,8 @@ int CeedGetParent(Ceed ceed, Ceed *parent) {
     CeedCall(CeedGetParent(ceed->parent, parent));
     return CEED_ERROR_SUCCESS;
   }
-  *parent = ceed;
+  *parent = NULL;
+  CeedCall(CeedReferenceCopy(ceed, parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -494,7 +570,8 @@ int CeedGetParent(Ceed ceed, Ceed *parent) {
   @ref Backend
 **/
 int CeedGetDelegate(Ceed ceed, Ceed *delegate) {
-  *delegate = ceed->delegate;
+  *delegate = NULL;
+  if (ceed->delegate) CeedCall(CeedReferenceCopy(ceed->delegate, delegate));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -512,7 +589,7 @@ int CeedGetDelegate(Ceed ceed, Ceed *delegate) {
   @ref Backend
 **/
 int CeedSetDelegate(Ceed ceed, Ceed delegate) {
-  ceed->delegate   = delegate;
+  CeedCall(CeedReferenceCopy(delegate, &ceed->delegate));
   delegate->parent = ceed;
   return CEED_ERROR_SUCCESS;
 }
@@ -532,7 +609,8 @@ int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj_name) {
   // Check for object delegate
   for (CeedInt i = 0; i < ceed->obj_delegate_count; i++) {
     if (!strcmp(obj_name, ceed->obj_delegates->obj_name)) {
-      *delegate = ceed->obj_delegates->delegate;
+      *delegate = NULL;
+      CeedCall(CeedReferenceCopy(ceed->obj_delegates->delegate, delegate));
       return CEED_ERROR_SUCCESS;
     }
   }
@@ -569,7 +647,7 @@ int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name) {
   ceed->obj_delegate_count++;
 
   // Set object delegate
-  ceed->obj_delegates[count].delegate = delegate;
+  CeedCall(CeedReferenceCopy(delegate, &ceed->obj_delegates[count].delegate));
   CeedCall(CeedStringAllocCopy(obj_name, &ceed->obj_delegates[count].obj_name));
 
   // Set delegate parent
@@ -577,21 +655,6 @@ int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name) {
   return CEED_ERROR_SUCCESS;
 }
 
-/**
-  @brief Get the fallback resource for `CeedOperator`
-
-  @param[in]  ceed     `Ceed` context
-  @param[out] resource Variable to store fallback resource
-
-  @return An error code: 0 - success, otherwise - failure
-
-  @ref Backend
-**/
-int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource) {
-  *resource = (const char *)ceed->op_fallback_resource;
-  return CEED_ERROR_SUCCESS;
-}
-
 /**
   @brief Get the fallback `Ceed` for `CeedOperator`
 
@@ -603,50 +666,32 @@ int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource) {
   @ref Backend
 **/
 int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) {
-  if (ceed->has_valid_op_fallback_resource) {
-    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- CeedOperator Fallback ----------\n");
-    CeedDebug(ceed, "Getting fallback from %s to %s\n", ceed->resource, ceed->op_fallback_resource);
+  if (ceed->op_fallback_ceed) {
+    CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- Ceed Fallback ----------\n");
+    CeedDebug(ceed, "Falling back from Ceed with backend %s at address %p to Ceed with backend %s at address %p", ceed->resource, ceed,
+              ceed->op_fallback_ceed->resource, ceed->op_fallback_ceed);
   }
 
-  // Create fallback Ceed if uninitalized
-  if (!ceed->op_fallback_ceed && ceed->has_valid_op_fallback_resource) {
-    CeedDebug(ceed, "Creating fallback Ceed");
-
-    Ceed        fallback_ceed;
-    const char *fallback_resource;
-
-    CeedCall(CeedGetOperatorFallbackResource(ceed, &fallback_resource));
-    CeedCall(CeedInit(fallback_resource, &fallback_ceed));
-    fallback_ceed->op_fallback_parent = ceed;
-    fallback_ceed->Error              = ceed->Error;
-    ceed->op_fallback_ceed            = fallback_ceed;
-  }
-  *fallback_ceed = ceed->op_fallback_ceed;
+  *fallback_ceed = NULL;
+  if (ceed->op_fallback_ceed) CeedCall(CeedReferenceCopy(ceed->op_fallback_ceed, fallback_ceed));
   return CEED_ERROR_SUCCESS;
 }
 
 /**
   @brief Set the fallback resource for `CeedOperator`.
 
-  The current resource, if any, is freed by calling this function.
-  This string is freed upon the destruction of the `Ceed` context.
+  The current fallback, if any, is freed by calling this function.
 
-  @param[in,out] ceed     `Ceed` context
-  @param[in]     resource Fallback resource to set
+  @param[in,out] ceed          `Ceed` context
+  @param[in]     fallback_ceed `Ceed` context to create fallback operators
 
   @return An error code: 0 - success, otherwise - failure
 
   @ref Backend
 **/
-int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource) {
-  // Free old
-  CeedCall(CeedFree(&ceed->op_fallback_resource));
-
-  // Set new
-  CeedCall(CeedStringAllocCopy(resource, (char **)&ceed->op_fallback_resource));
-
-  // Check validity
-  ceed->has_valid_op_fallback_resource = ceed->op_fallback_resource && ceed->resource && strcmp(ceed->op_fallback_resource, ceed->resource);
+int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed) {
+  CeedCall(CeedReferenceCopy(fallback_ceed, &ceed->op_fallback_ceed));
+  fallback_ceed->parent = ceed;
   return CEED_ERROR_SUCCESS;
 }
 
@@ -746,7 +791,317 @@ int CeedSetData(Ceed ceed, void *data) {
   @ref Backend
 **/
 int CeedReference(Ceed ceed) {
-  ceed->ref_count++;
+  CeedCall(CeedObjectReference((CeedObject)ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Computes the current memory usage of the work vectors in a `Ceed` context and prints to debug.abort
+
+  @param[in]  ceed     `Ceed` context
+  @param[out] usage_mb Address of the variable where the MB of work vector usage will be stored
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Developer
+**/
+int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb) {
+  if (!ceed->VectorCreate) {
+    Ceed delegate;
+
+    CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
+    CeedCall(CeedGetWorkVectorMemoryUsage(delegate, usage_mb));
+    CeedCall(CeedDestroy(&delegate));
+    return CEED_ERROR_SUCCESS;
+  }
+  *usage_mb = 0.0;
+  if (ceed->work_vectors) {
+    for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
+      CeedSize vec_len;
+      CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &vec_len));
+      *usage_mb += vec_len;
+    }
+    *usage_mb *= sizeof(CeedScalar) * 1e-6;
+    CeedDebug(ceed, "Resource {%s}: Work vectors memory usage: %" CeedInt_FMT " vectors, %g MB\n", ceed->resource, ceed->work_vectors->num_vecs,
+              *usage_mb);
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Clear inactive work vectors in a `Ceed` context below a minimum length.
+
+  @param[in,out] ceed    `Ceed` context
+  @param[in]     min_len Minimum length of work vector to keep
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedClearWorkVectors(Ceed ceed, CeedSize min_len) {
+  if (!ceed->VectorCreate) {
+    Ceed delegate;
+
+    CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
+    CeedCall(CeedClearWorkVectors(delegate, min_len));
+    CeedCall(CeedDestroy(&delegate));
+    return CEED_ERROR_SUCCESS;
+  }
+  if (!ceed->work_vectors) return CEED_ERROR_SUCCESS;
+  for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
+    if (ceed->work_vectors->is_in_use[i]) continue;
+    CeedSize vec_len;
+    CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &vec_len));
+    if (vec_len < min_len) {
+      // Note: increase ref_count to prevent Ceed destructor from triggering
+      CeedCall(CeedObjectReference((CeedObject)ceed));
+      CeedCall(CeedObjectReference((CeedObject)ceed));
+      CeedCall(CeedVectorDestroy(&ceed->work_vectors->vecs[i]));
+      // Note: restore ref_count
+      CeedObjectDereference((CeedObject)ceed);
+      ceed->work_vectors->num_vecs--;
+      if (ceed->work_vectors->num_vecs > 0) {
+        ceed->work_vectors->vecs[i]                                 = ceed->work_vectors->vecs[ceed->work_vectors->num_vecs];
+        ceed->work_vectors->is_in_use[i]                            = ceed->work_vectors->is_in_use[ceed->work_vectors->num_vecs];
+        ceed->work_vectors->is_in_use[ceed->work_vectors->num_vecs] = false;
+        i--;
+      }
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get a `CeedVector` for scratch work from a `Ceed` context.
+
+  Note: This vector must be restored with @ref CeedRestoreWorkVector().
+
+  @param[in]  ceed `Ceed` context
+  @param[in]  len  Minimum length of work vector
+  @param[out] vec  Address of the variable where `CeedVector` will be stored
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) {
+  CeedInt    i = 0;
+  CeedScalar usage_mb;
+
+  if (!ceed->VectorCreate) {
+    Ceed delegate;
+
+    CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
+    CeedCall(CeedGetWorkVector(delegate, len, vec));
+    CeedCall(CeedDestroy(&delegate));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  if (!ceed->work_vectors) CeedCall(CeedWorkVectorsCreate(ceed));
+
+  // Search for big enough work vector
+  for (i = 0; i < ceed->work_vectors->num_vecs; i++) {
+    if (!ceed->work_vectors->is_in_use[i]) {
+      CeedSize work_len;
+
+      CeedCall(CeedVectorGetLength(ceed->work_vectors->vecs[i], &work_len));
+      if (work_len >= len) break;
+    }
+  }
+  // Long enough vector was not found
+  if (i == ceed->work_vectors->num_vecs) {
+    if (ceed->work_vectors->max_vecs == 0) {
+      ceed->work_vectors->max_vecs = 1;
+      CeedCall(CeedCalloc(ceed->work_vectors->max_vecs, &ceed->work_vectors->vecs));
+      CeedCall(CeedCalloc(ceed->work_vectors->max_vecs, &ceed->work_vectors->is_in_use));
+    } else if (ceed->work_vectors->max_vecs == i) {
+      ceed->work_vectors->max_vecs *= 2;
+      CeedCall(CeedRealloc(ceed->work_vectors->max_vecs, &ceed->work_vectors->vecs));
+      CeedCall(CeedRealloc(ceed->work_vectors->max_vecs, &ceed->work_vectors->is_in_use));
+    }
+    ceed->work_vectors->num_vecs++;
+    CeedCallBackend(CeedVectorCreate(ceed, len, &ceed->work_vectors->vecs[i]));
+    // Note: ref_count manipulation to prevent a ref-loop
+    CeedObjectDereference((CeedObject)ceed);
+    if (ceed->is_debug) CeedGetWorkVectorMemoryUsage(ceed, &usage_mb);
+  }
+  // Return pointer to work vector
+  ceed->work_vectors->is_in_use[i] = true;
+  *vec                             = NULL;
+  CeedCall(CeedVectorReferenceCopy(ceed->work_vectors->vecs[i], vec));
+  // Note: bump ref_count to account for external access
+  CeedCall(CeedObjectReference((CeedObject)ceed));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Restore a `CeedVector` for scratch work from a `Ceed` context from @ref CeedGetWorkVector()
+
+  @param[in]  ceed `Ceed` context
+  @param[out] vec  `CeedVector` to restore
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec) {
+  if (!ceed->VectorCreate) {
+    Ceed delegate;
+
+    CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector"));
+    CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate");
+    CeedCall(CeedRestoreWorkVector(delegate, vec));
+    CeedCall(CeedDestroy(&delegate));
+    return CEED_ERROR_SUCCESS;
+  }
+
+  for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) {
+    if (*vec == ceed->work_vectors->vecs[i]) {
+      CeedCheck(ceed->work_vectors->is_in_use[i], ceed, CEED_ERROR_ACCESS, "Work vector %" CeedSize_FMT " was not checked out but is being returned");
+      CeedCall(CeedVectorDestroy(vec));
+      ceed->work_vectors->is_in_use[i] = false;
+      // Note: reduce ref_count again to prevent a ref-loop
+      CeedObjectDereference((CeedObject)ceed);
+      return CEED_ERROR_SUCCESS;
+    }
+  }
+  // LCOV_EXCL_START
+  return CeedError(ceed, CEED_ERROR_MAJOR, "vec was not checked out via CeedGetWorkVector()");
+  // LCOV_EXCL_STOP
+}
+
+/**
+  @brief Retrieve list of additional JiT source roots from `Ceed` context.
+
+  Note: The caller is responsible for restoring `jit_source_roots` with @ref CeedRestoreJitSourceRoots().
+
+  @param[in]  ceed             `Ceed` context
+  @param[out] num_source_roots Number of JiT source directories
+  @param[out] jit_source_roots Absolute paths to additional JiT source directories
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedGetJitSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***jit_source_roots) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  *num_source_roots = ceed_parent->num_jit_source_roots;
+  *jit_source_roots = (const char **)ceed_parent->jit_source_roots;
+  ceed_parent->num_jit_source_roots_readers++;
+  CeedCall(CeedDestroy(&ceed_parent));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Retrieve list of additional Rust source roots from `Ceed` context.
+
+  Note: The caller is responsible for restoring `rust_source_roots` with @ref CeedRestoreRustSourceRoots().
+
+  @param[in]  ceed             `Ceed` context
+  @param[out] num_source_roots Number of JiT source directories
+  @param[out] rust_source_roots Absolute paths to additional Rust source directories
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedGetRustSourceRoots(Ceed ceed, CeedInt *num_source_roots, const char ***rust_source_roots) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  *num_source_roots  = ceed_parent->num_rust_source_roots;
+  *rust_source_roots = (const char **)ceed_parent->rust_source_roots;
+  ceed_parent->num_rust_source_roots_readers++;
+  CeedCall(CeedDestroy(&ceed_parent));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Restore list of additional JiT source roots from with @ref CeedGetJitSourceRoots()
+
+  @param[in]  ceed             `Ceed` context
+  @param[out] jit_source_roots Absolute paths to additional JiT source directories
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedRestoreJitSourceRoots(Ceed ceed, const char ***jit_source_roots) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  *jit_source_roots = NULL;
+  ceed_parent->num_jit_source_roots_readers--;
+  CeedCall(CeedDestroy(&ceed_parent));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Restore list of additional Rust source roots from with @ref CeedGetJitSourceRoots()
+
+  @param[in]  ceed             `Ceed` context
+  @param[out] rust_source_roots Absolute paths to additional Rust source directories
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedRestoreRustSourceRoots(Ceed ceed, const char ***rust_source_roots) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  *rust_source_roots = NULL;
+  ceed_parent->num_rust_source_roots_readers--;
+  CeedCall(CeedDestroy(&ceed_parent));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Retrieve list of additional JiT defines from `Ceed` context.
+
+  Note: The caller is responsible for restoring `jit_defines` with @ref CeedRestoreJitDefines().
+
+  @param[in]  ceed            `Ceed` context
+  @param[out] num_jit_defines Number of JiT defines
+  @param[out] jit_defines     Strings such as `foo=bar`, used as `-Dfoo=bar` in JiT
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedGetJitDefines(Ceed ceed, CeedInt *num_jit_defines, const char ***jit_defines) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  *num_jit_defines = ceed_parent->num_jit_defines;
+  *jit_defines     = (const char **)ceed_parent->jit_defines;
+  ceed_parent->num_jit_defines_readers++;
+  CeedCall(CeedDestroy(&ceed_parent));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Restore list of additional JiT defines from with @ref CeedGetJitDefines()
+
+  @param[in]  ceed        `Ceed` context
+  @param[out] jit_defines String such as `foo=bar`, used as `-Dfoo=bar` in JiT
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref Backend
+**/
+int CeedRestoreJitDefines(Ceed ceed, const char ***jit_defines) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  *jit_defines = NULL;
+  ceed_parent->num_jit_defines_readers--;
+  CeedCall(CeedDestroy(&ceed_parent));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -892,15 +1247,16 @@ int CeedInit(const char *resource, Ceed *ceed) {
 
   // Setup Ceed
   CeedCall(CeedCalloc(1, ceed));
+  CeedCall(CeedObjectCreate(NULL, CeedView_Object, CeedDestroy_Object, &(*ceed)->obj));
   CeedCall(CeedCalloc(1, &(*ceed)->jit_source_roots));
+  CeedCall(CeedCalloc(1, &(*ceed)->rust_source_roots));
   const char *ceed_error_handler = getenv("CEED_ERROR_HANDLER");
   if (!ceed_error_handler) ceed_error_handler = "abort";
   if (!strcmp(ceed_error_handler, "exit")) (*ceed)->Error = CeedErrorExit;
   else if (!strcmp(ceed_error_handler, "store")) (*ceed)->Error = CeedErrorStore;
   else (*ceed)->Error = CeedErrorAbort;
   memcpy((*ceed)->err_msg, "No error message stored", 24);
-  (*ceed)->ref_count = 1;
-  (*ceed)->data      = NULL;
+  (*ceed)->data = NULL;
 
   // Set lookup table
   FOffset f_offsets[] = {
@@ -924,9 +1280,11 @@ int CeedInit(const char *resource, Ceed *ceed) {
       CEED_FTABLE_ENTRY(Ceed, CompositeOperatorCreate),
       CEED_FTABLE_ENTRY(CeedVector, HasValidArray),
       CEED_FTABLE_ENTRY(CeedVector, HasBorrowedArrayOfType),
+      CEED_FTABLE_ENTRY(CeedVector, CopyStrided),
       CEED_FTABLE_ENTRY(CeedVector, SetArray),
       CEED_FTABLE_ENTRY(CeedVector, TakeArray),
       CEED_FTABLE_ENTRY(CeedVector, SetValue),
+      CEED_FTABLE_ENTRY(CeedVector, SetValueStrided),
       CEED_FTABLE_ENTRY(CeedVector, SyncArray),
       CEED_FTABLE_ENTRY(CeedVector, GetArray),
       CEED_FTABLE_ENTRY(CeedVector, GetArrayRead),
@@ -948,9 +1306,12 @@ int CeedInit(const char *resource, Ceed *ceed) {
       CEED_FTABLE_ENTRY(CeedElemRestriction, GetOffsets),
       CEED_FTABLE_ENTRY(CeedElemRestriction, GetOrientations),
       CEED_FTABLE_ENTRY(CeedElemRestriction, GetCurlOrientations),
+      CEED_FTABLE_ENTRY(CeedElemRestriction, GetAtPointsElementOffset),
       CEED_FTABLE_ENTRY(CeedElemRestriction, Destroy),
       CEED_FTABLE_ENTRY(CeedBasis, Apply),
+      CEED_FTABLE_ENTRY(CeedBasis, ApplyAdd),
       CEED_FTABLE_ENTRY(CeedBasis, ApplyAtPoints),
+      CEED_FTABLE_ENTRY(CeedBasis, ApplyAddAtPoints),
       CEED_FTABLE_ENTRY(CeedBasis, Destroy),
       CEED_FTABLE_ENTRY(CeedTensorContract, Apply),
       CEED_FTABLE_ENTRY(CeedTensorContract, Destroy),
@@ -990,10 +1351,6 @@ int CeedInit(const char *resource, Ceed *ceed) {
   CeedCall(CeedCalloc(sizeof(f_offsets), &(*ceed)->f_offsets));
   memcpy((*ceed)->f_offsets, f_offsets, sizeof(f_offsets));
 
-  // Set fallback for advanced CeedOperator functions
-  const char fallback_resource[] = "";
-  CeedCall(CeedSetOperatorFallbackResource(*ceed, fallback_resource));
-
   // Record env variables CEED_DEBUG or DBG
   (*ceed)->is_debug = getenv("CEED_DEBUG") || getenv("DEBUG") || getenv("DBG");
 
@@ -1004,6 +1361,16 @@ int CeedInit(const char *resource, Ceed *ceed) {
   // Note: there will always be the default root for every Ceed but all additional paths are added to the top-most parent
   CeedCall(CeedAddJitSourceRoot(*ceed, (char *)CeedJitSourceRootDefault));
 
+  // By default, make cuda compile without clang, use nvrtc instead
+  // Note that this is overridden if a rust file is included (rust requires clang)
+  const char *env = getenv("GPU_CLANG");
+
+  if (env && strcmp(env, "1") == 0) {
+    (*ceed)->cuda_compile_with_clang = true;
+  } else {
+    (*ceed)->cuda_compile_with_clang = false;
+  }
+
   // Backend specific setup
   CeedCall(backends[match_index].init(&resource[match_help], *ceed));
   return CEED_ERROR_SUCCESS;
@@ -1029,6 +1396,7 @@ int CeedSetStream(Ceed ceed, void *handle) {
 
     if (delegate) CeedCall(CeedSetStream(delegate, handle));
     else return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support setting stream");
+    CeedCall(CeedDestroy(&delegate));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1092,6 +1460,7 @@ int CeedGetPreferredMemType(Ceed ceed, CeedMemType *mem_type) {
     } else {
       *mem_type = CEED_MEM_HOST;
     }
+    CeedCall(CeedDestroy(&delegate));
   }
   return CEED_ERROR_SUCCESS;
 }
@@ -1125,14 +1494,114 @@ int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root) {
   Ceed ceed_parent;
 
   CeedCall(CeedGetParent(ceed, &ceed_parent));
+  CeedCheck(!ceed_parent->num_jit_source_roots_readers, ceed, CEED_ERROR_ACCESS, "Cannot add JiT source root, read access has not been restored");
 
   CeedInt index       = ceed_parent->num_jit_source_roots;
   size_t  path_length = strlen(jit_source_root);
 
-  CeedCall(CeedRealloc(index + 1, &ceed_parent->jit_source_roots));
+  if (ceed_parent->num_jit_source_roots == ceed_parent->max_jit_source_roots) {
+    if (ceed_parent->max_jit_source_roots == 0) ceed_parent->max_jit_source_roots = 1;
+    ceed_parent->max_jit_source_roots *= 2;
+    CeedCall(CeedRealloc(ceed_parent->max_jit_source_roots, &ceed_parent->jit_source_roots));
+  }
   CeedCall(CeedCalloc(path_length + 1, &ceed_parent->jit_source_roots[index]));
   memcpy(ceed_parent->jit_source_roots[index], jit_source_root, path_length);
   ceed_parent->num_jit_source_roots++;
+  CeedCall(CeedDestroy(&ceed_parent));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Set additional Rust source root for `Ceed` context for use in QFunction
+
+  @param[in,out] ceed            `Ceed` context
+  @param[in]     rust_source_root Absolute path to additional Rust source directory
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedAddRustSourceRoot(Ceed ceed, const char *rust_source_root) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  CeedCheck(!ceed_parent->num_rust_source_roots_readers, ceed, CEED_ERROR_ACCESS, "Cannot add Rust source root, read access has not been restored");
+
+  CeedInt index       = ceed_parent->num_rust_source_roots;
+  size_t  path_length = strlen(rust_source_root);
+
+  if (ceed_parent->num_rust_source_roots == ceed_parent->max_rust_source_roots) {
+    if (ceed_parent->max_rust_source_roots == 0) ceed_parent->max_rust_source_roots = 1;
+    ceed_parent->max_rust_source_roots *= 2;
+    CeedCall(CeedRealloc(ceed_parent->max_rust_source_roots, &ceed_parent->rust_source_roots));
+  }
+  CeedCall(CeedCalloc(path_length + 1, &ceed_parent->rust_source_roots[index]));
+  memcpy(ceed_parent->rust_source_roots[index], rust_source_root, path_length);
+  ceed_parent->num_rust_source_roots++;
+  ceed_parent->cuda_compile_with_clang = true;
+  ceed->cuda_compile_with_clang        = true;
+  CeedCall(CeedDestroy(&ceed_parent));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Set additional JiT compiler define for `Ceed` context
+
+  @param[in,out] ceed       `Ceed` context
+  @param[in]     jit_define String such as `foo=bar`, used as `-Dfoo=bar` in JiT
+
+  @return An error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedAddJitDefine(Ceed ceed, const char *jit_define) {
+  Ceed ceed_parent;
+
+  CeedCall(CeedGetParent(ceed, &ceed_parent));
+  CeedCheck(!ceed_parent->num_jit_defines_readers, ceed, CEED_ERROR_ACCESS, "Cannot add JiT define, read access has not been restored");
+
+  CeedInt index         = ceed_parent->num_jit_defines;
+  size_t  define_length = strlen(jit_define);
+
+  if (ceed_parent->num_jit_defines == ceed_parent->max_jit_defines) {
+    if (ceed_parent->max_jit_defines == 0) ceed_parent->max_jit_defines = 1;
+    ceed_parent->max_jit_defines *= 2;
+    CeedCall(CeedRealloc(ceed_parent->max_jit_defines, &ceed_parent->jit_defines));
+  }
+  CeedCall(CeedCalloc(define_length + 1, &ceed_parent->jit_defines[index]));
+  memcpy(ceed_parent->jit_defines[index], jit_define, define_length);
+  ceed_parent->num_jit_defines++;
+  CeedCall(CeedDestroy(&ceed_parent));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Set the number of tabs to indent for @ref CeedView() output
+
+  @param[in] ceed     `Ceed` to set the number of view tabs
+  @param[in] num_tabs Number of view tabs to set
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedSetNumViewTabs(Ceed ceed, CeedInt num_tabs) {
+  CeedCall(CeedObjectSetNumViewTabs((CeedObject)ceed, num_tabs));
+  return CEED_ERROR_SUCCESS;
+}
+
+/**
+  @brief Get the number of tabs to indent for @ref CeedView() output
+
+  @param[in]  ceed     `Ceed` to get the number of view tabs
+  @param[out] num_tabs Number of view tabs
+
+  @return Error code: 0 - success, otherwise - failure
+
+  @ref User
+**/
+int CeedGetNumViewTabs(Ceed ceed, CeedInt *num_tabs) {
+  CeedCall(CeedObjectGetNumViewTabs((CeedObject)ceed, num_tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1147,15 +1616,24 @@ int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root) {
   @ref User
 **/
 int CeedView(Ceed ceed, FILE *stream) {
+  char       *tabs = NULL;
   CeedMemType mem_type;
 
   CeedCall(CeedGetPreferredMemType(ceed, &mem_type));
 
+  {
+    CeedInt num_tabs = 0;
+
+    CeedCall(CeedGetNumViewTabs(ceed, &num_tabs));
+    CeedCall(CeedCalloc(CEED_TAB_WIDTH * num_tabs + 1, &tabs));
+    for (CeedInt i = 0; i < CEED_TAB_WIDTH * num_tabs; i++) tabs[i] = ' ';
+  }
   fprintf(stream,
-          "Ceed\n"
-          "  Ceed Resource: %s\n"
-          "  Preferred MemType: %s\n",
-          ceed->resource, CeedMemTypes[mem_type]);
+          "%sCeed\n"
+          "%s  Ceed Resource: %s\n"
+          "%s  Preferred MemType: %s\n",
+          tabs, tabs, ceed->resource, tabs, CeedMemTypes[mem_type]);
+  CeedCall(CeedFree(&tabs));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1169,10 +1647,15 @@ int CeedView(Ceed ceed, FILE *stream) {
   @ref User
 **/
 int CeedDestroy(Ceed *ceed) {
-  if (!*ceed || --(*ceed)->ref_count > 0) {
+  if (!*ceed || CeedObjectDereference((CeedObject)*ceed) > 0) {
     *ceed = NULL;
     return CEED_ERROR_SUCCESS;
   }
+
+  CeedCheck(!(*ceed)->num_jit_source_roots_readers, *ceed, CEED_ERROR_ACCESS,
+            "Cannot destroy ceed context, read access for JiT source roots has been granted");
+  CeedCheck(!(*ceed)->num_jit_defines_readers, *ceed, CEED_ERROR_ACCESS, "Cannot add JiT source root, read access for JiT defines has been granted");
+
   if ((*ceed)->delegate) CeedCall(CeedDestroy(&(*ceed)->delegate));
 
   if ((*ceed)->obj_delegate_count > 0) {
@@ -1190,10 +1673,21 @@ int CeedDestroy(Ceed *ceed) {
   }
   CeedCall(CeedFree(&(*ceed)->jit_source_roots));
 
+  for (CeedInt i = 0; i < (*ceed)->num_jit_defines; i++) {
+    CeedCall(CeedFree(&(*ceed)->jit_defines[i]));
+  }
+  CeedCall(CeedFree(&(*ceed)->jit_defines));
+
+  for (CeedInt i = 0; i < (*ceed)->num_rust_source_roots; i++) {
+    CeedCall(CeedFree(&(*ceed)->rust_source_roots[i]));
+  }
+  CeedCall(CeedFree(&(*ceed)->rust_source_roots));
+
   CeedCall(CeedFree(&(*ceed)->f_offsets));
   CeedCall(CeedFree(&(*ceed)->resource));
   CeedCall(CeedDestroy(&(*ceed)->op_fallback_ceed));
-  CeedCall(CeedFree(&(*ceed)->op_fallback_resource));
+  CeedCall(CeedWorkVectorsDestroy(*ceed));
+  CeedCall(CeedObjectDestroy_Private(&(*ceed)->obj));
   CeedCall(CeedFree(ceed));
   return CEED_ERROR_SUCCESS;
 }
@@ -1201,7 +1695,6 @@ int CeedDestroy(Ceed *ceed) {
 // LCOV_EXCL_START
 const char *CeedErrorFormat(Ceed ceed, const char *format, va_list *args) {
   if (ceed->parent) return CeedErrorFormat(ceed->parent, format, args);
-  if (ceed->op_fallback_parent) return CeedErrorFormat(ceed->op_fallback_parent, format, args);
   // Using pointer to va_list for better FFI, but clang-tidy can't verify va_list is initalized
   vsnprintf(ceed->err_msg, CEED_MAX_RESOURCE_LEN, format, *args);  // NOLINT
   return ceed->err_msg;
@@ -1265,7 +1758,6 @@ int CeedErrorReturn(Ceed ceed, const char *filename, int line_no, const char *fu
 // LCOV_EXCL_START
 int CeedErrorStore(Ceed ceed, const char *filename, int line_no, const char *func, int err_code, const char *format, va_list *args) {
   if (ceed->parent) return CeedErrorStore(ceed->parent, filename, line_no, func, err_code, format, args);
-  if (ceed->op_fallback_parent) return CeedErrorStore(ceed->op_fallback_parent, filename, line_no, func, err_code, format, args);
 
   // Build message
   int len = snprintf(ceed->err_msg, CEED_MAX_RESOURCE_LEN, "%s:%d in %s(): ", filename, line_no, func);
@@ -1345,7 +1837,6 @@ int CeedSetErrorHandler(Ceed ceed, CeedErrorHandler handler) {
 **/
 int CeedGetErrorMessage(Ceed ceed, const char **err_msg) {
   if (ceed->parent) return CeedGetErrorMessage(ceed->parent, err_msg);
-  if (ceed->op_fallback_parent) return CeedGetErrorMessage(ceed->op_fallback_parent, err_msg);
   *err_msg = ceed->err_msg;
   return CEED_ERROR_SUCCESS;
 }
@@ -1364,7 +1855,6 @@ int CeedGetErrorMessage(Ceed ceed, const char **err_msg) {
 **/
 int CeedResetErrorMessage(Ceed ceed, const char **err_msg) {
   if (ceed->parent) return CeedResetErrorMessage(ceed->parent, err_msg);
-  if (ceed->op_fallback_parent) return CeedResetErrorMessage(ceed->op_fallback_parent, err_msg);
   *err_msg = NULL;
   memcpy(ceed->err_msg, "No error message stored", 24);
   return CEED_ERROR_SUCCESS;
@@ -1387,7 +1877,7 @@ int CeedResetErrorMessage(Ceed ceed, const char **err_msg) {
 
   @ref Developer
 
-  @sa CEED_VERSION_GE()
+  @sa CEED_VERSION_GE() CeedGetGitVersion() CeedGetBuildConfiguration()
 */
 int CeedGetVersion(int *major, int *minor, int *patch, bool *release) {
   if (major) *major = CEED_VERSION_MAJOR;
diff --git a/julia/LibCEED.jl/examples/ex3-volume.jl b/julia/LibCEED.jl/examples/ex3-volume.jl
new file mode 100644
index 0000000000..68edf59817
--- /dev/null
+++ b/julia/LibCEED.jl/examples/ex3-volume.jl
@@ -0,0 +1,197 @@
+using LibCEED, Printf
+
+include("common.jl")
+
+function transform_mesh_coords!(dim, mesh_size, mesh_coords)
+    @witharray coords = mesh_coords begin
+        if dim == 1
+            for i = 1:mesh_size
+                # map [0,1] to [0,1] varying the mesh density
+                coords[i] = 0.5 + 1.0/sqrt(3.0)*sin((2.0/3.0)*pi*(coords[i] - 0.5))
+            end
+            exact_volume = 1.0
+        else
+            num_nodes = mesh_size÷dim
+            @inbounds @simd for i = 1:num_nodes
+                # map (x,y) from [0,1]x[0,1] to the quarter annulus with polar
+                # coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi
+                u = coords[i]
+                v = coords[i+num_nodes]
+                u = 1.0 + u
+                v = pi/2*v
+                coords[i] = u*cos(v)
+                coords[i+num_nodes] = u*sin(v)
+            end
+            exact_volume = 3.0/4.0*pi
+        end
+        return exact_volume
+    end
+end
+
+function run_ex3(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size)
+    ncompx = dim
+    prob_size < 0 && (prob_size = 256*1024)
+
+    ceed = Ceed(ceed_spec)
+    mesh_basis =
+        create_tensor_h1_lagrange_basis(ceed, dim, ncompx, mesh_order + 1, num_qpts, GAUSS)
+    sol_basis =
+        create_tensor_h1_lagrange_basis(ceed, dim, 1, sol_order + 1, num_qpts, GAUSS)
+
+    # Determine the mesh size based on the given approximate problem size.
+    nxyz = get_cartesian_mesh_size(dim, sol_order, prob_size)
+    println("Mesh size: ", nxyz)
+
+    # Build CeedElemRestriction objects describing the mesh and solution discrete
+    # representations.
+    mesh_size, mesh_rstr, _ =
+        build_cartesian_restriction(ceed, dim, nxyz, mesh_order, ncompx, num_qpts)
+    num_q_comp = 1 + div(dim*(dim + 1), 2)
+    sol_size, _, qdata_rstr_i = build_cartesian_restriction(
+        ceed,
+        dim,
+        nxyz,
+        sol_order,
+        num_q_comp,
+        num_qpts,
+        mode=StridedOnly,
+    )
+    sol_size, sol_rstr, sol_rstr_i = build_cartesian_restriction(
+        ceed,
+        dim,
+        nxyz,
+        sol_order,
+        1,
+        num_qpts,
+        mode=RestrictionAndStrided,
+    )
+    println("Number of mesh nodes     : ", div(mesh_size, dim))
+    println("Number of solution nodes : ", sol_size)
+
+    # Create a CeedVector with the mesh coordinates.
+    mesh_coords = CeedVector(ceed, mesh_size)
+    set_cartesian_mesh_coords!(dim, nxyz, mesh_order, mesh_coords)
+    # Apply a transformation to the mesh.
+    exact_vol = transform_mesh_coords!(dim, mesh_size, mesh_coords)
+
+    #Create the Q-function that builds the mass+diffusion operator ( i.e it computes the quadrature data) and set its context data.
+    @interior_qf build_qfunc = (
+        ceed,
+        dim=dim,
+        (dx, :in, EVAL_GRAD, dim, dim),      # ← THIS LINE: dx input
+        (weights, :in, EVAL_WEIGHT),         # ← weights input
+        (qdata, :out, EVAL_NONE, num_q_comp), # ← qdata output
+        begin
+            # Compute determinant
+            det_J = det(dx)
+
+            # Store mass component
+            qdata[1] = weights*det_J
+
+            # Store diffusion components (J^T * J)
+            idx = 2
+            for i = 1:dim
+                for j = i:dim
+                    qdata[idx] = dx[:, i]'*dx[:, j]
+                    idx += 1
+                end
+            end
+        end,
+    )
+
+    # Create the operator that builds the quadrature data for the mass+diffusion operator.
+    build_oper = Operator(
+        ceed,
+        qf=build_qfunc,
+        fields=[
+            (:dx, mesh_rstr, mesh_basis, CeedVectorActive()),
+            (:weights, ElemRestrictionNone(), mesh_basis, CeedVectorNone()),
+            (:qdata, qdata_rstr_i, BasisNone(), CeedVectorActive()),
+        ],
+    )
+
+    # Compute the quadrature data for the mass+diff operator.
+    elem_qpts = num_qpts^dim
+    num_elem = prod(nxyz)
+    qdata = CeedVector(ceed, num_elem*elem_qpts*num_q_comp)
+    print("Computing the quadrature data for the mass+diffusion operator ...")
+    flush(stdout)
+    apply!(build_oper, mesh_coords, qdata)
+    println(" done.")
+
+    # Create the Q-function that defines the action of the mass+diffusion operator.
+    @interior_qf apply_qfunc = (
+        ceed,
+        dim=dim,
+        (u, :in, EVAL_INTERP),
+        (du, :in, EVAL_GRAD, dim),
+        (qdata, :in, EVAL_NONE, num_q_comp),
+        (v, :out, EVAL_INTERP),
+        (dv, :out, EVAL_GRAD, dim),
+        begin
+            # Apply mass: v = qdata[1] * u
+            v .= qdata[1].*u
+
+            # Apply diffusion: dv = (qdata[2:end]) * du
+            # The qdata contains the symmetric diffusion tensor (J^T*J)
+            # dv_i = sum_j (J^T*J)_{i,j} * du_j
+
+            # For efficiency, rebuild the matrix from stored components
+            idx = 2
+            for i = 1:dim
+                dv_i = 0.0
+                for j = 1:dim
+                    # Reconstruct symmetric matrix element
+                    if j >= i
+                        mat_idx = idx + div((j - 1)*j, 2) + (i - 1)
+                    else
+                        mat_idx = idx + div((i - 1)*i, 2) + (j - 1)
+                    end
+                    dv_i += qdata[mat_idx]*du[j]
+                end
+                dv[i] = dv_i
+            end
+        end,
+    )
+
+    # Create the mass+diffusion operator.
+    oper = Operator(
+        ceed,
+        qf=apply_qfunc,
+        fields=[
+            (:u, sol_rstr, sol_basis, CeedVectorActive()),
+            (:du, sol_rstr, sol_basis, CeedVectorActive()),
+            (:qdata, qdata_rstr_i, BasisNone(), qdata),
+            (:v, sol_rstr, sol_basis, CeedVectorActive()),
+            (:dv, sol_rstr, sol_basis, CeedVectorActive()),
+        ],
+    )
+
+    # Compute the mesh volume using the mass+diffusion operator: vol = 1^T \cdot (M + K) \cdot 1
+    print("Computing the mesh volume using the formula: vol = 1^T * (M + K) * 1...")
+    flush(stdout)
+    # Create auxiliary solution-size vectors.
+    u = CeedVector(ceed, sol_size)
+    v = CeedVector(ceed, sol_size)
+    # Initialize 'u' with ones.
+    u[] = 1.0
+    # Apply the mass+diffusion operator: 'u' -> 'v'.
+    apply!(oper, u, v)
+    # Compute and print the sum of the entries of 'v' giving the mesh volume.
+    vol = witharray_read(sum, v, MEM_HOST)
+
+    println(" done.")
+    @printf("Exact mesh volume    : % .14g\n", exact_vol)
+    @printf("Computed mesh volume : % .14g\n", vol)
+    @printf("Volume error         : % .14g\n", vol - exact_vol)
+end
+
+# Entry point
+run_ex3(
+    ceed_spec="/cpu/self",
+    dim=3,
+    mesh_order=4,
+    sol_order=4,
+    num_qpts=4 + 2,
+    prob_size=-1,
+)
diff --git a/julia/LibCEED.jl/src/Operator.jl b/julia/LibCEED.jl/src/Operator.jl
index d1de710c54..2ad8f41b1c 100644
--- a/julia/LibCEED.jl/src/Operator.jl
+++ b/julia/LibCEED.jl/src/Operator.jl
@@ -69,11 +69,11 @@ collection `ops`.
 """
 function create_composite_operator(c::Ceed, ops)
     ref = Ref{C.CeedOperator}()
-    C.CeedCompositeOperatorCreate(c[], ref)
+    C.CeedOperatorCreateComposite(c[], ref)
     comp_op = Operator(ref, QFunctionNone(), QFunctionNone(), QFunctionNone())
     comp_op.sub_ops = ops
     for op ∈ ops
-        C.CeedCompositeOperatorAddSub(comp_op[], op[])
+        C.CeedOperatorCompositeAddSub(comp_op[], op[])
     end
     comp_op
 end
diff --git a/julia/LibCEED.jl/src/generated/libceed_bindings.jl b/julia/LibCEED.jl/src/generated/libceed_bindings.jl
index f814609b86..d4bba38974 100644
--- a/julia/LibCEED.jl/src/generated/libceed_bindings.jl
+++ b/julia/LibCEED.jl/src/generated/libceed_bindings.jl
@@ -436,8 +436,8 @@ function CeedBasisApply(basis, num_elem, t_mode, eval_mode, u, v)
     ccall((:CeedBasisApply, libceed), Cint, (CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector), basis, num_elem, t_mode, eval_mode, u, v)
 end
 
-function CeedBasisApplyAtPoints(basis, num_points, t_mode, eval_mode, x_ref, u, v)
-    ccall((:CeedBasisApplyAtPoints, libceed), Cint, (CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector), basis, num_points, t_mode, eval_mode, x_ref, u, v)
+function CeedBasisApplyAtPoints(basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)
+    ccall((:CeedBasisApplyAtPoints, libceed), Cint, (CeedBasis, CeedInt, Ptr{CeedInt}, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector, CeedVector), basis, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)
 end
 
 function CeedBasisGetCeed(basis, ceed)
@@ -658,8 +658,8 @@ function CeedOperatorCreate(ceed, qf, dqf, dqfT, op)
     ccall((:CeedOperatorCreate, libceed), Cint, (Ceed, CeedQFunction, CeedQFunction, CeedQFunction, Ptr{CeedOperator}), ceed, qf, dqf, dqfT, op)
 end
 
-function CeedCompositeOperatorCreate(ceed, op)
-    ccall((:CeedCompositeOperatorCreate, libceed), Cint, (Ceed, Ptr{CeedOperator}), ceed, op)
+function CeedOperatorCreateComposite(ceed, op)
+    ccall((:CeedOperatorCreateComposite, libceed), Cint, (Ceed, Ptr{CeedOperator}), ceed, op)
 end
 
 function CeedOperatorReferenceCopy(op, op_copy)
@@ -674,16 +674,16 @@ function CeedOperatorGetFields(op, num_input_fields, input_fields, num_output_fi
     ccall((:CeedOperatorGetFields, libceed), Cint, (CeedOperator, Ptr{CeedInt}, Ptr{Ptr{CeedOperatorField}}, Ptr{CeedInt}, Ptr{Ptr{CeedOperatorField}}), op, num_input_fields, input_fields, num_output_fields, output_fields)
 end
 
-function CeedCompositeOperatorAddSub(composite_op, sub_op)
-    ccall((:CeedCompositeOperatorAddSub, libceed), Cint, (CeedOperator, CeedOperator), composite_op, sub_op)
+function CeedOperatorCompositeAddSub(composite_op, sub_op)
+    ccall((:CeedOperatorCompositeAddSub, libceed), Cint, (CeedOperator, CeedOperator), composite_op, sub_op)
 end
 
-function CeedCompositeOperatorGetNumSub(op, num_suboperators)
-    ccall((:CeedCompositeOperatorGetNumSub, libceed), Cint, (CeedOperator, Ptr{CeedInt}), op, num_suboperators)
+function CeedOperatorCompositeGetNumSub(op, num_suboperators)
+    ccall((:CeedOperatorCompositeGetNumSub, libceed), Cint, (CeedOperator, Ptr{CeedInt}), op, num_suboperators)
 end
 
-function CeedCompositeOperatorGetSubList(op, sub_operators)
-    ccall((:CeedCompositeOperatorGetSubList, libceed), Cint, (CeedOperator, Ptr{Ptr{CeedOperator}}), op, sub_operators)
+function CeedOperatorCompositeGetSubList(op, sub_operators)
+    ccall((:CeedOperatorCompositeGetSubList, libceed), Cint, (CeedOperator, Ptr{Ptr{CeedOperator}}), op, sub_operators)
 end
 
 function CeedOperatorCheckReady(op)
@@ -738,8 +738,8 @@ function CeedOperatorLinearAssemble(op, values)
     ccall((:CeedOperatorLinearAssemble, libceed), Cint, (CeedOperator, CeedVector), op, values)
 end
 
-function CeedCompositeOperatorGetMultiplicity(op, num_skip_indices, skip_indices, mult)
-    ccall((:CeedCompositeOperatorGetMultiplicity, libceed), Cint, (CeedOperator, CeedInt, Ptr{CeedInt}, CeedVector), op, num_skip_indices, skip_indices, mult)
+function CeedOperatorCompositeGetMultiplicity(op, num_skip_indices, skip_indices, mult)
+    ccall((:CeedOperatorCompositeGetMultiplicity, libceed), Cint, (CeedOperator, CeedInt, Ptr{CeedInt}, CeedVector), op, num_skip_indices, skip_indices, mult)
 end
 
 function CeedOperatorMultigridLevelCreate(op_fine, p_mult_fine, rstr_coarse, basis_coarse, op_coarse, op_prolong, op_restrict)
diff --git a/julia/LibCEED.jl/test/rundevtests.jl b/julia/LibCEED.jl/test/rundevtests.jl
index 9527d8d2dc..59d0e4840e 100644
--- a/julia/LibCEED.jl/test/rundevtests.jl
+++ b/julia/LibCEED.jl/test/rundevtests.jl
@@ -8,4 +8,36 @@ function checkoutput(str, fname)
     return true
 end
 
-@testset "LibCEED Development Tests" begin end
+@testset "LibCEED Development Tests" begin
+    @testset "Operator" begin
+        c = Ceed()
+        @interior_qf id = (
+            c,
+            (input, :in, EVAL_INTERP),
+            (output, :out, EVAL_INTERP),
+            begin
+                output[] = input
+            end,
+        )
+        b = create_tensor_h1_lagrange_basis(c, 3, 1, 3, 3, GAUSS_LOBATTO)
+        n = getnumnodes(b)
+        offsets = Vector{CeedInt}(0:n-1)
+        r = create_elem_restriction(c, 1, n, 1, 1, n, offsets)
+        op = Operator(
+            c;
+            qf=id,
+            fields=[
+                (:input, r, b, CeedVectorActive()),
+                (:output, r, b, CeedVectorActive()),
+            ],
+        )
+
+        v = rand(CeedScalar, n)
+        v1 = CeedVector(c, v)
+        v2 = CeedVector(c, n)
+
+        comp_op = create_composite_operator(c, [op])
+        apply!(comp_op, v1, v2)
+        @test @witharray_read(a1 = v1, @witharray_read(a2 = v2, a1 == a2))
+    end
+end
diff --git a/julia/LibCEED.jl/test/runtests.jl b/julia/LibCEED.jl/test/runtests.jl
index 83c7598ecd..724240d786 100644
--- a/julia/LibCEED.jl/test/runtests.jl
+++ b/julia/LibCEED.jl/test/runtests.jl
@@ -256,10 +256,6 @@ else
             LibCEED.assemble_add_diagonal!(op, diag_vector)
             @test @witharray(a = diag_vector, a == fill(1.0, n))
 
-            comp_op = create_composite_operator(c, [op])
-            apply!(comp_op, v1, v2)
-            @test @witharray_read(a1 = v1, @witharray_read(a2 = v2, a1 == a2))
-
             @test showstr(op) == """
                 CeedOperator
                   1 elements with 27 quadrature points each
diff --git a/python/__init__.py b/python/__init__.py
index 9c6560addb..c5eb31d18e 100644
--- a/python/__init__.py
+++ b/python/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/build_ceed_cffi.py b/python/build_ceed_cffi.py
index 6a77781d25..71c99a21f4 100644
--- a/python/build_ceed_cffi.py
+++ b/python/build_ceed_cffi.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
@@ -13,6 +13,29 @@
 ceed_version_ge = re.compile(r'\s+\(!?CEED_VERSION.*')
 
 
+# Checks to see if a c line is part of the lines we have to exclude (macros)
+def is_valid_line(line):
+    if (line.startswith("#") and not line.startswith("#include")):
+        return False
+    if (line.startswith("#include \"deprecated.h\"")):
+        return False
+    if (line.startswith("  CEED_QFUNCTION_ATTR")):
+        return False
+    if (line.startswith("  static const char")):
+        return False
+    if (line.endswith('\\\n')):
+        return False
+    if ("CeedErrorImpl" in line):
+        return False
+    if (r'const char *, ...);' in line):
+        return False
+    if (line.startswith("CEED_EXTERN const char *const")):
+        return False
+    if (ceed_version_ge.match(line)):
+        return False
+    return True
+
+
 def get_ceed_dirs():
     here = os.path.dirname(os.path.abspath(__file__))
     prefix = os.path.dirname(here)
@@ -31,14 +54,7 @@ def get_ceed_dirs():
 lines = []
 for header_path in ["include/ceed/types.h", "include/ceed/ceed.h"]:
     with open(os.path.abspath(header_path)) as f:
-        lines += [line.strip() for line in f if
-                  not (line.startswith("#") and not line.startswith("#include")) and
-                  not line.startswith("  static") and
-                  not line.startswith("  CEED_QFUNCTION_ATTR") and
-                  "CeedErrorImpl" not in line and
-                  "const char *, ...);" not in line and
-                  not line.startswith("CEED_EXTERN const char *const") and
-                  not ceed_version_ge.match(line)]
+        lines += [line.strip() for line in f if is_valid_line(line)]
 lines = [line.replace("CEED_EXTERN", "extern") for line in lines]
 
 # Find scalar type inclusion line and insert definitions
diff --git a/python/ceed.py b/python/ceed.py
index 092cd1d047..8df025acae 100644
--- a/python/ceed.py
+++ b/python/ceed.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_basis.py b/python/ceed_basis.py
index e1c12def62..c4f71a7089 100644
--- a/python/ceed_basis.py
+++ b/python/ceed_basis.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_constants.py b/python/ceed_constants.py
index ab99f7b643..8b4ea22673 100644
--- a/python/ceed_constants.py
+++ b/python/ceed_constants.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_elemrestriction.py b/python/ceed_elemrestriction.py
index 9c986eb58e..42e72a9311 100644
--- a/python/ceed_elemrestriction.py
+++ b/python/ceed_elemrestriction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_operator.py b/python/ceed_operator.py
index 740beef641..90c3549f36 100644
--- a/python/ceed_operator.py
+++ b/python/ceed_operator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
@@ -331,7 +331,7 @@ def __init__(self, ceed):
         # Reference to Ceed
         self._ceed = ceed
         # libCEED call
-        err_code = lib.CeedCompositeOperatorCreate(
+        err_code = lib.CeedOperatorCreateComposite(
             self._ceed._pointer[0], self._pointer)
         self._ceed._check_error(err_code)
 
@@ -343,7 +343,7 @@ def add_sub(self, subop):
              subop: sub-operator Operator"""
 
         # libCEED call
-        err_code = lib.CeedCompositeOperatorAddSub(
+        err_code = lib.CeedOperatorCompositeAddSub(
             self._pointer[0], subop._pointer[0])
         self._ceed._check_error(err_code)
 
diff --git a/python/ceed_qfunction.py b/python/ceed_qfunction.py
index 896d69bfd4..9c73581ae4 100644
--- a/python/ceed_qfunction.py
+++ b/python/ceed_qfunction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_qfunctioncontext.py b/python/ceed_qfunctioncontext.py
index 92c072bdd2..b98863aa7d 100644
--- a/python/ceed_qfunctioncontext.py
+++ b/python/ceed_qfunctioncontext.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/ceed_vector.py b/python/ceed_vector.py
index c72bb265ad..06bd693ec6 100644
--- a/python/ceed_vector.py
+++ b/python/ceed_vector.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/Makefile b/python/tests/Makefile
index 918d1551ef..94c49d5b3a 100644
--- a/python/tests/Makefile
+++ b/python/tests/Makefile
@@ -1,10 +1,12 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
 #
 # This file is part of CEED:  http://github.com/ceed
 
+PYTHON ?= python3
+
 clean:
 	rm -rf build __pycache__ .pytest_cache *.so
 
diff --git a/python/tests/conftest.py b/python/tests/conftest.py
index 62e8f4bb1d..70bdf69cfc 100644
--- a/python/tests/conftest.py
+++ b/python/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/libceed-qfunctions.c b/python/tests/libceed-qfunctions.c
index bef055452a..14fdfa6749 100644
--- a/python/tests/libceed-qfunctions.c
+++ b/python/tests/libceed-qfunctions.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/output/test_504.out b/python/tests/output/test_504.out
index 3e1d828503..3b8acf130a 100644
--- a/python/tests/output/test_504.out
+++ b/python/tests/output/test_504.out
@@ -1,4 +1,4 @@
-CeedOperator
+CeedOperator - setup_mass
   15 elements with 8 quadrature points each
   3 fields
   2 input fields:
@@ -20,7 +20,7 @@ CeedOperator
       No basis
       Active vector
 
-CeedOperator
+CeedOperator - apply_mass
   15 elements with 8 quadrature points each
   3 fields
   2 input fields:
diff --git a/python/tests/setup-qfunctions.py b/python/tests/setup-qfunctions.py
index aab21d830a..74074b67c3 100644
--- a/python/tests/setup-qfunctions.py
+++ b/python/tests/setup-qfunctions.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
@@ -16,7 +16,7 @@
 qf_module = Extension("libceed_qfunctions",
                       include_dirs=[os.path.join(CEED_DIR, 'include')],
                       sources=["libceed-qfunctions.c"],
-                      extra_compile_args=["-O3", "-std=c99",
+                      extra_compile_args=["-O3", "-std=c11",
                                           "-Wno-unused-variable",
                                           "-Wno-unused-function"])
 
diff --git a/python/tests/setup.cfg b/python/tests/setup.cfg
index e0bbfb441c..7290d8e331 100644
--- a/python/tests/setup.cfg
+++ b/python/tests/setup.cfg
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-0-ceed.py b/python/tests/test-0-ceed.py
index b38d31a332..5ab30e1fd9 100644
--- a/python/tests/test-0-ceed.py
+++ b/python/tests/test-0-ceed.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
@@ -20,7 +20,7 @@ def test_000(ceed_resource):
     ceed = libceed.Ceed(ceed_resource)
 
 # -------------------------------------------------------------------------------
-# Test return of Ceed backend prefered memory type
+# Test return of Ceed backend preferred memory type
 # -------------------------------------------------------------------------------
 
 
diff --git a/python/tests/test-1-vector.py b/python/tests/test-1-vector.py
index 246b82515e..9838a35b30 100644
--- a/python/tests/test-1-vector.py
+++ b/python/tests/test-1-vector.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
@@ -359,7 +359,7 @@ def test_126(ceed_resource, capsys):
     a = np.arange(10, 10 + n, dtype=ceed.scalar_type())
     x.set_array(a, cmode=libceed.USE_POINTER)
 
-    a2 = np.arange(10, n, dtype=ceed.scalar_type())
+    a2 = np.arange(0, n, dtype=ceed.scalar_type())
     y.set_array(a2, cmode=libceed.USE_POINTER)
 
     y.copy_from(x)
diff --git a/python/tests/test-2-elemrestriction.py b/python/tests/test-2-elemrestriction.py
index 6f9b1a3c38..60feb73626 100644
--- a/python/tests/test-2-elemrestriction.py
+++ b/python/tests/test-2-elemrestriction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-3-basis.py b/python/tests/test-3-basis.py
index 453e0b8401..aaded78b21 100644
--- a/python/tests/test-3-basis.py
+++ b/python/tests/test-3-basis.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-4-qfunction.py b/python/tests/test-4-qfunction.py
index 42dd844e2f..0491a2c624 100644
--- a/python/tests/test-4-qfunction.py
+++ b/python/tests/test-4-qfunction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-5-operator.py b/python/tests/test-5-operator.py
index 7127fe395e..1b67bdab2d 100644
--- a/python/tests/test-5-operator.py
+++ b/python/tests/test-5-operator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 # All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 #
 # SPDX-License-Identifier: BSD-2-Clause
diff --git a/python/tests/test-qfunctions.h b/python/tests/test-qfunctions.h
index eb9a5f3f1d..5790d540aa 100644
--- a/python/tests/test-qfunctions.h
+++ b/python/tests/test-qfunctions.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/rust/libceed-sys/build.rs b/rust/libceed-sys/build.rs
index d1cc93be6e..8c510fcc8c 100644
--- a/rust/libceed-sys/build.rs
+++ b/rust/libceed-sys/build.rs
@@ -14,6 +14,7 @@ fn main() {
     } else {
         // Install libceed.a or libceed.so to $OUT_DIR/lib
         let makeflags = env("CARGO_MAKEFLAGS").unwrap();
+        let optflags = env("CARGO_CEED_OPT_FLAGS").unwrap_or_else(|| "".to_string());
         let mut make = Command::new("make");
         make.arg("install")
             .arg(format!("prefix={}", out_dir.to_string_lossy()))
@@ -28,6 +29,9 @@ fn main() {
             .arg("FC=") // Don't try to find Fortran (unused library build/install)
             .env("MAKEFLAGS", makeflags)
             .current_dir("c-src");
+        if optflags.len() > 0 {
+            make.env("OPT", optflags);
+        }
         if statik {
             make.arg("STATIC=1");
         }
diff --git a/rust/libceed-sys/src/lib.rs b/rust/libceed-sys/src/lib.rs
index 1279fdddf5..21dff0d343 100644
--- a/rust/libceed-sys/src/lib.rs
+++ b/rust/libceed-sys/src/lib.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -17,5 +17,7 @@ pub mod bind_ceed {
     #![allow(non_upper_case_globals)]
     #![allow(non_camel_case_types)]
     #![allow(dead_code)]
+    #![allow(clippy::too_long_first_doc_paragraph)]
+    #![allow(non_snake_case)]
     include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
 }
diff --git a/rust/libceed/src/basis.rs b/rust/libceed/src/basis.rs
index 4c11fb79b4..7018e0d462 100644
--- a/rust/libceed/src/basis.rs
+++ b/rust/libceed/src/basis.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,7 +8,7 @@
 //! A Ceed Basis defines the discrete finite element basis and associated
 //! quadrature rule.
 
-use crate::prelude::*;
+use crate::{prelude::*, vector::Vector, EvalMode, TransposeMode};
 
 // -----------------------------------------------------------------------------
 // Basis option
@@ -27,7 +27,7 @@ impl<'a> From<&'a Basis<'_>> for BasisOpt<'a> {
 }
 impl<'a> BasisOpt<'a> {
     /// Transform a Rust libCEED BasisOpt into C libCEED CeedBasis
-    pub(crate) fn to_raw(self) -> bind_ceed::CeedBasis {
+    pub(crate) fn to_raw(&self) -> bind_ceed::CeedBasis {
         match self {
             Self::Some(basis) => basis.ptr,
             Self::None => unsafe { bind_ceed::CEED_BASIS_NONE },
@@ -37,7 +37,7 @@ impl<'a> BasisOpt<'a> {
     /// Check if a BasisOpt is Some
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let b = ceed.basis_tensor_H1_Lagrange(1, 2, 3, 4, QuadMode::Gauss)?;
@@ -59,7 +59,7 @@ impl<'a> BasisOpt<'a> {
     /// Check if a BasisOpt is None
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let b = ceed.basis_tensor_H1_Lagrange(1, 2, 3, 4, QuadMode::Gauss)?;
@@ -108,7 +108,7 @@ impl<'a> fmt::Display for Basis<'a> {
     /// View a Basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let b = ceed.basis_tensor_H1_Lagrange(1, 2, 3, 4, QuadMode::Gauss)?;
@@ -134,6 +134,7 @@ impl<'a> fmt::Display for Basis<'a> {
 // -----------------------------------------------------------------------------
 impl<'a> Basis<'a> {
     // Constructors
+    #[allow(clippy::too_many_arguments)]
     pub fn create_tensor_H1(
         ceed: &crate::Ceed,
         dim: usize,
@@ -152,7 +153,7 @@ impl<'a> Basis<'a> {
             i32::try_from(P1d).unwrap(),
             i32::try_from(Q1d).unwrap(),
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedBasisCreateTensorH1(
                 ceed.ptr,
                 dim,
@@ -165,8 +166,14 @@ impl<'a> Basis<'a> {
                 qweight1d.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
+        Ok(Self {
+            ptr,
+            _lifeline: PhantomData,
+        })
+    }
+
+    pub(crate) unsafe fn from_raw(ptr: bind_ceed::CeedBasis) -> crate::Result<Self> {
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -189,16 +196,16 @@ impl<'a> Basis<'a> {
             i32::try_from(Q).unwrap(),
             qmode as bind_ceed::CeedQuadMode,
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedBasisCreateTensorH1Lagrange(ceed.ptr, dim, ncomp, P, Q, qmode, &mut ptr)
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
         })
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn create_H1(
         ceed: &crate::Ceed,
         topo: crate::ElemTopology,
@@ -217,7 +224,7 @@ impl<'a> Basis<'a> {
             i32::try_from(nnodes).unwrap(),
             i32::try_from(nqpts).unwrap(),
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedBasisCreateH1(
                 ceed.ptr,
                 topo,
@@ -230,14 +237,14 @@ impl<'a> Basis<'a> {
                 qweight.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
         })
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn create_Hdiv(
         ceed: &crate::Ceed,
         topo: crate::ElemTopology,
@@ -256,7 +263,7 @@ impl<'a> Basis<'a> {
             i32::try_from(nnodes).unwrap(),
             i32::try_from(nqpts).unwrap(),
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedBasisCreateHdiv(
                 ceed.ptr,
                 topo,
@@ -269,14 +276,14 @@ impl<'a> Basis<'a> {
                 qweight.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
         })
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn create_Hcurl(
         ceed: &crate::Ceed,
         topo: crate::ElemTopology,
@@ -295,7 +302,7 @@ impl<'a> Basis<'a> {
             i32::try_from(nnodes).unwrap(),
             i32::try_from(nqpts).unwrap(),
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedBasisCreateHcurl(
                 ceed.ptr,
                 topo,
@@ -308,22 +315,23 @@ impl<'a> Basis<'a> {
                 qweight.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
         })
     }
 
+    // Raw Ceed for error handling
+    #[doc(hidden)]
+    fn ceed(&self) -> bind_ceed::Ceed {
+        unsafe { bind_ceed::CeedBasisReturnCeed(self.ptr) }
+    }
+
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedBasisGetCeed(self.ptr, &mut ptr);
-        }
-        crate::check_error(ptr, ierr)
+        crate::check_error(|| self.ceed(), ierr)
     }
 
     /// Apply basis evaluation from nodes to quadrature points or vice versa
@@ -339,7 +347,7 @@ impl<'a> Basis<'a> {
     /// * `v`     - Output Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, TransposeMode, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// const Q: usize = 6;
@@ -404,15 +412,15 @@ impl<'a> Basis<'a> {
             tmode as bind_ceed::CeedTransposeMode,
             emode as bind_ceed::CeedEvalMode,
         );
-        let ierr =
-            unsafe { bind_ceed::CeedBasisApply(self.ptr, nelem, tmode, emode, u.ptr, v.ptr) };
-        self.check_error(ierr)
+        self.check_error(unsafe {
+            bind_ceed::CeedBasisApply(self.ptr, nelem, tmode, emode, u.ptr, v.ptr)
+        })
     }
 
     /// Returns the dimension for given Basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let dim = 2;
@@ -432,7 +440,7 @@ impl<'a> Basis<'a> {
     /// Returns number of components for given Basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ncomp = 2;
@@ -452,7 +460,7 @@ impl<'a> Basis<'a> {
     /// Returns total number of nodes (in dim dimensions) of a Basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let p = 3;
@@ -473,7 +481,7 @@ impl<'a> Basis<'a> {
     /// Basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let q = 4;
@@ -502,7 +510,7 @@ impl<'a> Basis<'a> {
     /// points and weights.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, TransposeMode, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let coarse = ceed.basis_tensor_H1_Lagrange(1, 1, 2, 3, QuadMode::Gauss)?;
@@ -523,8 +531,9 @@ impl<'a> Basis<'a> {
     /// ```
     pub fn create_projection(&self, to: &Self) -> crate::Result<Self> {
         let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe { bind_ceed::CeedBasisCreateProjection(self.ptr, to.ptr, &mut ptr) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe {
+            bind_ceed::CeedBasisCreateProjection(self.ptr, to.ptr, &mut ptr)
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs
index 950a840403..d251220ff7 100644
--- a/rust/libceed/src/elem_restriction.rs
+++ b/rust/libceed/src/elem_restriction.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -8,7 +8,7 @@
 //! A Ceed ElemRestriction decomposes elements and groups the degrees of freedom
 //! (dofs) according to the different elements they belong to.
 
-use crate::prelude::*;
+use crate::{prelude::*, vector::Vector, TransposeMode};
 
 // -----------------------------------------------------------------------------
 // ElemRestriction option
@@ -28,7 +28,7 @@ impl<'a> From<&'a ElemRestriction<'_>> for ElemRestrictionOpt<'a> {
 impl<'a> ElemRestrictionOpt<'a> {
     /// Transform a Rust libCEED ElemRestrictionOpt into C libCEED
     /// CeedElemRestriction
-    pub(crate) fn to_raw(self) -> bind_ceed::CeedElemRestriction {
+    pub(crate) fn to_raw(&self) -> bind_ceed::CeedElemRestriction {
         match self {
             Self::Some(rstr) => rstr.ptr,
             Self::None => unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE },
@@ -38,7 +38,7 @@ impl<'a> ElemRestrictionOpt<'a> {
     /// Check if an ElemRestrictionOpt is Some
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, ElemRestrictionOpt, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -66,7 +66,7 @@ impl<'a> ElemRestrictionOpt<'a> {
     /// Check if an ElemRestrictionOpt is None
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, ElemRestrictionOpt, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -121,7 +121,7 @@ impl<'a> fmt::Display for ElemRestriction<'a> {
     /// View an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -153,6 +153,7 @@ impl<'a> fmt::Display for ElemRestriction<'a> {
 // -----------------------------------------------------------------------------
 impl<'a> ElemRestriction<'a> {
     // Constructors
+    #[allow(clippy::too_many_arguments)]
     pub fn create(
         ceed: &crate::Ceed,
         nelem: usize,
@@ -172,7 +173,7 @@ impl<'a> ElemRestriction<'a> {
             isize::try_from(lsize).unwrap(),
             mtype as bind_ceed::CeedMemType,
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreate(
                 ceed.ptr,
                 nelem,
@@ -185,14 +186,21 @@ impl<'a> ElemRestriction<'a> {
                 offsets.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
+        Ok(Self {
+            ptr,
+            _lifeline: PhantomData,
+        })
+    }
+
+    pub(crate) unsafe fn from_raw(ptr: bind_ceed::CeedElemRestriction) -> crate::Result<Self> {
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
         })
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn create_oriented(
         ceed: &crate::Ceed,
         nelem: usize,
@@ -213,7 +221,7 @@ impl<'a> ElemRestriction<'a> {
             isize::try_from(lsize).unwrap(),
             mtype as bind_ceed::CeedMemType,
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreateOriented(
                 ceed.ptr,
                 nelem,
@@ -227,14 +235,14 @@ impl<'a> ElemRestriction<'a> {
                 orients.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
         })
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn create_curl_oriented(
         ceed: &crate::Ceed,
         nelem: usize,
@@ -255,7 +263,7 @@ impl<'a> ElemRestriction<'a> {
             isize::try_from(lsize).unwrap(),
             mtype as bind_ceed::CeedMemType,
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreateCurlOriented(
                 ceed.ptr,
                 nelem,
@@ -269,8 +277,7 @@ impl<'a> ElemRestriction<'a> {
                 curlorients.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -292,7 +299,7 @@ impl<'a> ElemRestriction<'a> {
             i32::try_from(ncomp).unwrap(),
             isize::try_from(lsize).unwrap(),
         );
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreateStrided(
                 ceed.ptr,
                 nelem,
@@ -302,28 +309,29 @@ impl<'a> ElemRestriction<'a> {
                 strides.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
         })
     }
 
+    // Raw Ceed for error handling
+    #[doc(hidden)]
+    fn ceed(&self) -> bind_ceed::Ceed {
+        unsafe { bind_ceed::CeedElemRestrictionReturnCeed(self.ptr) }
+    }
+
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedElemRestrictionGetCeed(self.ptr, &mut ptr);
-        }
-        crate::check_error(ptr, ierr)
+        crate::check_error(|| self.ceed(), ierr)
     }
 
     /// Create an Lvector for an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -343,16 +351,16 @@ impl<'a> ElemRestriction<'a> {
     pub fn create_lvector<'b>(&self) -> crate::Result<Vector<'b>> {
         let mut ptr_lvector = std::ptr::null_mut();
         let null = std::ptr::null_mut() as *mut _;
-        let ierr =
-            unsafe { bind_ceed::CeedElemRestrictionCreateVector(self.ptr, &mut ptr_lvector, null) };
-        self.check_error(ierr)?;
-        Vector::from_raw(ptr_lvector)
+        self.check_error(unsafe {
+            bind_ceed::CeedElemRestrictionCreateVector(self.ptr, &mut ptr_lvector, null)
+        })?;
+        unsafe { Vector::from_raw(ptr_lvector) }
     }
 
     /// Create an Evector for an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -372,16 +380,16 @@ impl<'a> ElemRestriction<'a> {
     pub fn create_evector<'b>(&self) -> crate::Result<Vector<'b>> {
         let mut ptr_evector = std::ptr::null_mut();
         let null = std::ptr::null_mut() as *mut _;
-        let ierr =
-            unsafe { bind_ceed::CeedElemRestrictionCreateVector(self.ptr, null, &mut ptr_evector) };
-        self.check_error(ierr)?;
-        Vector::from_raw(ptr_evector)
+        self.check_error(unsafe {
+            bind_ceed::CeedElemRestrictionCreateVector(self.ptr, null, &mut ptr_evector)
+        })?;
+        unsafe { Vector::from_raw(ptr_evector) }
     }
 
     /// Create Vectors for an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -402,12 +410,11 @@ impl<'a> ElemRestriction<'a> {
     pub fn create_vectors<'b, 'c>(&self) -> crate::Result<(Vector<'b>, Vector<'c>)> {
         let mut ptr_lvector = std::ptr::null_mut();
         let mut ptr_evector = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedElemRestrictionCreateVector(self.ptr, &mut ptr_lvector, &mut ptr_evector)
-        };
-        self.check_error(ierr)?;
-        let lvector = Vector::from_raw(ptr_lvector)?;
-        let evector = Vector::from_raw(ptr_evector)?;
+        })?;
+        let lvector = unsafe { Vector::from_raw(ptr_lvector)? };
+        let evector = unsafe { Vector::from_raw(ptr_evector)? };
         Ok((lvector, evector))
     }
 
@@ -422,7 +429,7 @@ impl<'a> ElemRestriction<'a> {
     ///               decided by the backend.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType, Scalar, TransposeMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -451,7 +458,7 @@ impl<'a> ElemRestriction<'a> {
     /// ```
     pub fn apply(&self, tmode: TransposeMode, u: &Vector, ru: &mut Vector) -> crate::Result<i32> {
         let tmode = tmode as bind_ceed::CeedTransposeMode;
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedElemRestrictionApply(
                 self.ptr,
                 tmode,
@@ -459,14 +466,13 @@ impl<'a> ElemRestriction<'a> {
                 ru.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     /// Returns the Lvector component stride
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -492,7 +498,7 @@ impl<'a> ElemRestriction<'a> {
     /// Returns the total number of elements in the range of a ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -517,7 +523,7 @@ impl<'a> ElemRestriction<'a> {
     /// Returns the size of elements in the ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -543,7 +549,7 @@ impl<'a> ElemRestriction<'a> {
     /// Returns the size of the Lvector for an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -568,7 +574,7 @@ impl<'a> ElemRestriction<'a> {
     /// Returns the number of components in the elements of an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -594,7 +600,7 @@ impl<'a> ElemRestriction<'a> {
     /// Returns the multiplicity of nodes in an ElemRestriction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -621,8 +627,9 @@ impl<'a> ElemRestriction<'a> {
     /// # }
     /// ```
     pub fn multiplicity(&self, mult: &mut Vector) -> crate::Result<i32> {
-        let ierr = unsafe { bind_ceed::CeedElemRestrictionGetMultiplicity(self.ptr, mult.ptr) };
-        self.check_error(ierr)
+        self.check_error(unsafe {
+            bind_ceed::CeedElemRestrictionGetMultiplicity(self.ptr, mult.ptr)
+        })
     }
 }
 
diff --git a/rust/libceed/src/lib.rs b/rust/libceed/src/lib.rs
index bf7de98e07..ae6487ee83 100755
--- a/rust/libceed/src/lib.rs
+++ b/rust/libceed/src/lib.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -19,18 +19,6 @@ use crate::prelude::*;
 use std::sync::Once;
 
 pub mod prelude {
-    pub use crate::{
-        basis::{self, Basis, BasisOpt},
-        elem_restriction::{self, ElemRestriction, ElemRestrictionOpt},
-        operator::{self, CompositeOperator, Operator, OperatorField},
-        qfunction::{
-            self, QFunction, QFunctionByName, QFunctionField, QFunctionInputs, QFunctionOpt,
-            QFunctionOutputs,
-        },
-        vector::{self, Vector, VectorOpt, VectorSliceWrapper},
-        ElemTopology, EvalMode, MemType, NormType, QuadMode, Scalar, TransposeMode,
-        CEED_STRIDES_BACKEND, EPSILON, MAX_QFUNCTION_FIELDS,
-    };
     pub(crate) use libceed_sys::bind_ceed;
     pub(crate) use std::convert::TryFrom;
     pub(crate) use std::ffi::{CStr, CString};
@@ -157,11 +145,27 @@ impl fmt::Display for Error {
     }
 }
 
+// -----------------------------------------------------------------------------
+// Internal crate contents
+// -----------------------------------------------------------------------------
+pub use crate::{
+    basis::{Basis, BasisOpt},
+    elem_restriction::{ElemRestriction, ElemRestrictionOpt},
+    operator::{CompositeOperator, Operator, OperatorField},
+    qfunction::{
+        QFunction, QFunctionByName, QFunctionField, QFunctionInputs, QFunctionOpt, QFunctionOutputs,
+    },
+    vector::{Vector, VectorOpt, VectorSliceWrapper},
+};
+
 // -----------------------------------------------------------------------------
 // Internal error checker
 // -----------------------------------------------------------------------------
 #[doc(hidden)]
-pub(crate) fn check_error(ceed_ptr: bind_ceed::Ceed, ierr: i32) -> Result<i32> {
+pub(crate) fn check_error<F>(ceed_ptr: F, ierr: i32) -> Result<i32>
+where
+    F: FnOnce() -> bind_ceed::Ceed,
+{
     // Return early if code is clean
     if ierr == bind_ceed::CeedErrorType_CEED_ERROR_SUCCESS {
         return Ok(ierr);
@@ -169,7 +173,7 @@ pub(crate) fn check_error(ceed_ptr: bind_ceed::Ceed, ierr: i32) -> Result<i32> {
     // Retrieve error message
     let mut ptr: *const std::os::raw::c_char = std::ptr::null_mut();
     let c_str = unsafe {
-        bind_ceed::CeedGetErrorMessage(ceed_ptr, &mut ptr);
+        bind_ceed::CeedGetErrorMessage(ceed_ptr(), &mut ptr);
         std::ffi::CStr::from_ptr(ptr)
     };
     let message = c_str.to_string_lossy().to_string();
@@ -225,8 +229,8 @@ impl Clone for Ceed {
     /// ```
     fn clone(&self) -> Self {
         let mut ptr_clone = std::ptr::null_mut();
-        let ierr = unsafe { bind_ceed::CeedReferenceCopy(self.ptr, &mut ptr_clone) };
-        self.check_error(ierr).expect("failed to clone Ceed");
+        self.check_error(unsafe { bind_ceed::CeedReferenceCopy(self.ptr, &mut ptr_clone) })
+            .expect("failed to clone Ceed");
         Self { ptr: ptr_clone }
     }
 }
@@ -305,7 +309,7 @@ impl Ceed {
 
         // Call to libCEED
         let mut ptr = std::ptr::null_mut();
-        let mut ierr = unsafe { bind_ceed::CeedInit(c_resource.as_ptr() as *const i8, &mut ptr) };
+        let mut ierr = unsafe { bind_ceed::CeedInit(c_resource.as_ptr(), &mut ptr) };
         if ierr != 0 {
             panic!("Error initializing backend resource: {}", resource)
         }
@@ -424,7 +428,7 @@ impl Ceed {
     ///                    `[0, lsize - 1]`.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -437,6 +441,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn elem_restriction<'a>(
         &self,
         nelem: usize,
@@ -483,7 +488,7 @@ impl Ceed {
     ///                    orientation.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -500,6 +505,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn oriented_elem_restriction<'a>(
         &self,
         nelem: usize,
@@ -547,7 +553,7 @@ impl Ceed {
     ///                     unknowns upon restriction.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let nelem = 3;
@@ -586,6 +592,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn curl_oriented_elem_restriction<'a>(
         &self,
         nelem: usize,
@@ -687,6 +694,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn basis_tensor_H1<'a>(
         &self,
         dim: usize,
@@ -716,7 +724,7 @@ impl Ceed {
     ///               accuracy for the quadrature)
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QuadMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let b = ceed.basis_tensor_H1_Lagrange(2, 1, 3, 4, QuadMode::Gauss)?;
@@ -752,7 +760,7 @@ impl Ceed {
     ///                 the reference element
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, ElemTopology};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let interp = [
@@ -849,6 +857,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn basis_H1<'a>(
         &self,
         topo: ElemTopology,
@@ -883,7 +892,7 @@ impl Ceed {
     ///                 the reference element
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, ElemTopology};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let interp = [
@@ -947,6 +956,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn basis_Hdiv<'a>(
         &self,
         topo: ElemTopology,
@@ -980,7 +990,7 @@ impl Ceed {
     ///                 the reference element
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, ElemTopology};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let interp = [
@@ -1041,6 +1051,7 @@ impl Ceed {
     /// # Ok(())
     /// # }
     /// ```
+    #[allow(clippy::too_many_arguments)]
     pub fn basis_Hcurl<'a>(
         &self,
         topo: ElemTopology,
@@ -1074,7 +1085,7 @@ impl Ceed {
     /// * `f`       - Boxed closure to evaluate weak form at quadrature points.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QFunctionInputs, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -1132,7 +1143,7 @@ impl Ceed {
     ///              Jacobian of the qf (or qfunction_none)
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QFunctionOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
diff --git a/rust/libceed/src/operator.rs b/rust/libceed/src/operator.rs
index 91e6b6f4a4..fae468d3c9 100644
--- a/rust/libceed/src/operator.rs
+++ b/rust/libceed/src/operator.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -9,7 +9,13 @@
 //! Ceed QFunction. A Ceed Operator connects Ceed ElemRestrictions,
 //! Ceed Bases, and Ceed QFunctions.
 
-use crate::prelude::*;
+use crate::{
+    basis::{Basis, BasisOpt},
+    elem_restriction::{ElemRestriction, ElemRestrictionOpt},
+    prelude::*,
+    qfunction::QFunctionOpt,
+    vector::{Vector, VectorOpt},
+};
 
 // -----------------------------------------------------------------------------
 // Operator Field context wrapper
@@ -17,6 +23,9 @@ use crate::prelude::*;
 #[derive(Debug)]
 pub struct OperatorField<'a> {
     pub(crate) ptr: bind_ceed::CeedOperatorField,
+    pub(crate) vector: crate::Vector<'a>,
+    pub(crate) elem_restriction: crate::ElemRestriction<'a>,
+    pub(crate) basis: crate::Basis<'a>,
     _lifeline: PhantomData<&'a ()>,
 }
 
@@ -24,17 +33,48 @@ pub struct OperatorField<'a> {
 // Implementations
 // -----------------------------------------------------------------------------
 impl<'a> OperatorField<'a> {
+    pub(crate) unsafe fn from_raw(
+        ptr: bind_ceed::CeedOperatorField,
+        ceed: crate::Ceed,
+    ) -> crate::Result<Self> {
+        let vector = {
+            let mut vector_ptr = std::ptr::null_mut();
+            ceed.check_error(bind_ceed::CeedOperatorFieldGetVector(ptr, &mut vector_ptr))?;
+            crate::Vector::from_raw(vector_ptr)?
+        };
+        let elem_restriction = {
+            let mut elem_restriction_ptr = std::ptr::null_mut();
+            ceed.check_error(bind_ceed::CeedOperatorFieldGetElemRestriction(
+                ptr,
+                &mut elem_restriction_ptr,
+            ))?;
+            crate::ElemRestriction::from_raw(elem_restriction_ptr)?
+        };
+        let basis = {
+            let mut basis_ptr = std::ptr::null_mut();
+            ceed.check_error(bind_ceed::CeedOperatorFieldGetBasis(ptr, &mut basis_ptr))?;
+            crate::Basis::from_raw(basis_ptr)?
+        };
+        Ok(Self {
+            ptr,
+            vector,
+            elem_restriction,
+            basis,
+            _lifeline: PhantomData,
+        })
+    }
+
     /// Get the name of an OperatorField
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -74,14 +114,14 @@ impl<'a> OperatorField<'a> {
     /// Get the ElemRestriction of an OperatorField
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -106,42 +146,54 @@ impl<'a> OperatorField<'a> {
     ///     inputs[0].elem_restriction().is_some(),
     ///     "Incorrect field ElemRestriction"
     /// );
+    /// if let ElemRestrictionOpt::Some(r) = inputs[0].elem_restriction() {
+    ///     assert_eq!(
+    ///         r.num_elements(),
+    ///         ne,
+    ///         "Incorrect field ElemRestriction number of elements"
+    ///     );
+    /// }
+    ///
     /// assert!(
     ///     inputs[1].elem_restriction().is_none(),
     ///     "Incorrect field ElemRestriction"
     /// );
+    ///
+    /// let outputs = op.outputs()?;
+    ///
+    /// assert!(
+    ///     outputs[0].elem_restriction().is_some(),
+    ///     "Incorrect field ElemRestriction"
+    /// );
+    /// if let ElemRestrictionOpt::Some(r) = outputs[0].elem_restriction() {
+    ///     assert_eq!(
+    ///         r.num_elements(),
+    ///         ne,
+    ///         "Incorrect field ElemRestriction number of elements"
+    ///     );
+    /// }
     /// # Ok(())
     /// # }
     /// ```
-    pub fn elem_restriction(&self) -> ElemRestrictionOpt {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedOperatorFieldGetElemRestriction(self.ptr, &mut ptr);
-        }
-        if ptr == unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE } {
+    pub fn elem_restriction(&self) -> ElemRestrictionOpt<'_> {
+        if self.elem_restriction.ptr == unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE } {
             ElemRestrictionOpt::None
         } else {
-            let slice = unsafe {
-                std::slice::from_raw_parts(
-                    &ptr as *const bind_ceed::CeedElemRestriction as *const crate::ElemRestriction,
-                    1 as usize,
-                )
-            };
-            ElemRestrictionOpt::Some(&slice[0])
+            ElemRestrictionOpt::Some(&self.elem_restriction)
         }
     }
 
     /// Get the Basis of an OperatorField
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -163,7 +215,21 @@ impl<'a> OperatorField<'a> {
     /// let inputs = op.inputs()?;
     ///
     /// assert!(inputs[0].basis().is_some(), "Incorrect field Basis");
+    /// if let BasisOpt::Some(b) = inputs[0].basis() {
+    ///     assert_eq!(
+    ///         b.num_quadrature_points(),
+    ///         q,
+    ///         "Incorrect field Basis number of quadrature points"
+    ///     );
+    /// }
     /// assert!(inputs[1].basis().is_some(), "Incorrect field Basis");
+    /// if let BasisOpt::Some(b) = inputs[1].basis() {
+    ///     assert_eq!(
+    ///         b.num_quadrature_points(),
+    ///         q,
+    ///         "Incorrect field Basis number of quadrature points"
+    ///     );
+    /// }
     ///
     /// let outputs = op.outputs()?;
     ///
@@ -171,35 +237,25 @@ impl<'a> OperatorField<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn basis(&self) -> BasisOpt {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedOperatorFieldGetBasis(self.ptr, &mut ptr);
-        }
-        if ptr == unsafe { bind_ceed::CEED_BASIS_NONE } {
+    pub fn basis(&self) -> BasisOpt<'_> {
+        if self.basis.ptr == unsafe { bind_ceed::CEED_BASIS_NONE } {
             BasisOpt::None
         } else {
-            let slice = unsafe {
-                std::slice::from_raw_parts(
-                    &ptr as *const bind_ceed::CeedBasis as *const crate::Basis,
-                    1 as usize,
-                )
-            };
-            BasisOpt::Some(&slice[0])
+            BasisOpt::Some(&self.basis)
         }
     }
 
     /// Get the Vector of an OperatorField
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -222,26 +278,20 @@ impl<'a> OperatorField<'a> {
     ///
     /// assert!(inputs[0].vector().is_active(), "Incorrect field Vector");
     /// assert!(inputs[1].vector().is_none(), "Incorrect field Vector");
+    ///
+    /// let outputs = op.outputs()?;
+    ///
+    /// assert!(outputs[0].vector().is_active(), "Incorrect field Vector");
     /// # Ok(())
     /// # }
     /// ```
-    pub fn vector(&self) -> VectorOpt {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedOperatorFieldGetVector(self.ptr, &mut ptr);
-        }
-        if ptr == unsafe { bind_ceed::CEED_VECTOR_ACTIVE } {
+    pub fn vector(&self) -> VectorOpt<'_> {
+        if self.vector.ptr == unsafe { bind_ceed::CEED_VECTOR_ACTIVE } {
             VectorOpt::Active
-        } else if ptr == unsafe { bind_ceed::CEED_VECTOR_NONE } {
+        } else if self.vector.ptr == unsafe { bind_ceed::CEED_VECTOR_NONE } {
             VectorOpt::None
         } else {
-            let slice = unsafe {
-                std::slice::from_raw_parts(
-                    &ptr as *const bind_ceed::CeedVector as *const crate::Vector,
-                    1 as usize,
-                )
-            };
-            VectorOpt::Some(&slice[0])
+            VectorOpt::Some(&self.vector)
         }
     }
 }
@@ -296,14 +346,14 @@ impl<'a> fmt::Display for OperatorCore<'a> {
 /// View an Operator
 ///
 /// ```
-/// # use libceed::prelude::*;
+/// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
 /// # fn main() -> libceed::Result<()> {
 /// # let ceed = libceed::Ceed::default_init();
 /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
 ///
 /// // Operator field arguments
 /// let ne = 3;
-/// let q = 4 as usize;
+/// let q = 4_usize;
 /// let mut ind: Vec<i32> = vec![0; 2 * ne];
 /// for i in 0..ne {
 ///     ind[2 * i + 0] = i as i32;
@@ -336,13 +386,13 @@ impl<'a> fmt::Display for Operator<'a> {
 /// View a composite Operator
 ///
 /// ```
-/// # use libceed::prelude::*;
+/// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
 /// # fn main() -> libceed::Result<()> {
 /// # let ceed = libceed::Ceed::default_init();
 ///
 /// // Sub operator field arguments
 /// let ne = 3;
-/// let q = 4 as usize;
+/// let q = 4_usize;
 /// let mut ind: Vec<i32> = vec![0; 2 * ne];
 /// for i in 0..ne {
 ///     ind[2 * i + 0] = i as i32;
@@ -393,100 +443,94 @@ impl<'a> fmt::Display for CompositeOperator<'a> {
 // Core functionality
 // -----------------------------------------------------------------------------
 impl<'a> OperatorCore<'a> {
+    // Raw Ceed for error handling
+    #[doc(hidden)]
+    fn ceed(&self) -> bind_ceed::Ceed {
+        unsafe { bind_ceed::CeedOperatorReturnCeed(self.ptr) }
+    }
+
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedOperatorGetCeed(self.ptr, &mut ptr);
-        }
-        crate::check_error(ptr, ierr)
+        crate::check_error(|| self.ceed(), ierr)
     }
 
     // Common implementations
     pub fn check(&self) -> crate::Result<i32> {
-        let ierr = unsafe { bind_ceed::CeedOperatorCheckReady(self.ptr) };
-        self.check_error(ierr)
+        self.check_error(unsafe { bind_ceed::CeedOperatorCheckReady(self.ptr) })
     }
 
     pub fn name(&self, name: &str) -> crate::Result<i32> {
         let name_c = CString::new(name).expect("CString::new failed");
-        let ierr = unsafe { bind_ceed::CeedOperatorSetName(self.ptr, name_c.as_ptr()) };
-        self.check_error(ierr)
+        self.check_error(unsafe { bind_ceed::CeedOperatorSetName(self.ptr, name_c.as_ptr()) })
     }
 
     pub fn apply(&self, input: &Vector, output: &mut Vector) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorApply(
                 self.ptr,
                 input.ptr,
                 output.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     pub fn apply_add(&self, input: &Vector, output: &mut Vector) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorApplyAdd(
                 self.ptr,
                 input.ptr,
                 output.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     pub fn linear_assemble_diagonal(&self, assembled: &mut Vector) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorLinearAssembleDiagonal(
                 self.ptr,
                 assembled.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     pub fn linear_assemble_add_diagonal(&self, assembled: &mut Vector) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorLinearAssembleAddDiagonal(
                 self.ptr,
                 assembled.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     pub fn linear_assemble_point_block_diagonal(
         &self,
         assembled: &mut Vector,
     ) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorLinearAssemblePointBlockDiagonal(
                 self.ptr,
                 assembled.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     pub fn linear_assemble_add_point_block_diagonal(
         &self,
         assembled: &mut Vector,
     ) -> crate::Result<i32> {
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedOperatorLinearAssembleAddPointBlockDiagonal(
                 self.ptr,
                 assembled.ptr,
                 bind_ceed::CEED_REQUEST_IMMEDIATE,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 }
 
@@ -502,7 +546,7 @@ impl<'a> Operator<'a> {
         dqfT: impl Into<QFunctionOpt<'b>>,
     ) -> crate::Result<Self> {
         let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedOperatorCreate(
                 ceed.ptr,
                 qf.into().to_raw(),
@@ -510,8 +554,7 @@ impl<'a> Operator<'a> {
                 dqfT.into().to_raw(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             op_core: OperatorCore {
                 ptr,
@@ -520,7 +563,7 @@ impl<'a> Operator<'a> {
         })
     }
 
-    fn from_raw(ptr: bind_ceed::CeedOperator) -> crate::Result<Self> {
+    unsafe fn from_raw(ptr: bind_ceed::CeedOperator) -> crate::Result<Self> {
         Ok(Self {
             op_core: OperatorCore {
                 ptr,
@@ -534,14 +577,14 @@ impl<'a> Operator<'a> {
     /// * 'name' - Name to set
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -575,7 +618,7 @@ impl<'a> Operator<'a> {
     /// * `output` - Output Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -653,7 +696,7 @@ impl<'a> Operator<'a> {
     /// * `output` - Output Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -733,7 +776,7 @@ impl<'a> Operator<'a> {
     ///
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -765,8 +808,8 @@ impl<'a> Operator<'a> {
         v: impl Into<VectorOpt<'b>>,
     ) -> crate::Result<Self> {
         let fieldname = CString::new(fieldname).expect("CString::new failed");
-        let fieldname = fieldname.as_ptr() as *const i8;
-        let ierr = unsafe {
+        let fieldname = fieldname.as_ptr();
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorSetField(
                 self.op_core.ptr,
                 fieldname,
@@ -774,22 +817,21 @@ impl<'a> Operator<'a> {
                 b.into().to_raw(),
                 v.into().to_raw(),
             )
-        };
-        self.op_core.check_error(ierr)?;
+        })?;
         Ok(self)
     }
 
     /// Get a slice of Operator inputs
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -814,41 +856,52 @@ impl<'a> Operator<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn inputs(&self) -> crate::Result<&[crate::OperatorField]> {
+    pub fn inputs(&self) -> crate::Result<Vec<crate::OperatorField<'_>>> {
         // Get array of raw C pointers for inputs
         let mut num_inputs = 0;
         let mut inputs_ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorGetFields(
                 self.op_core.ptr,
                 &mut num_inputs,
                 &mut inputs_ptr,
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
-                std::ptr::null_mut() as *mut *mut bind_ceed::CeedOperatorField,
+                std::ptr::null_mut(),
             )
-        };
-        self.op_core.check_error(ierr)?;
+        })?;
         // Convert raw C pointers to fixed length slice
         let inputs_slice = unsafe {
             std::slice::from_raw_parts(
-                inputs_ptr as *const crate::OperatorField,
+                inputs_ptr as *mut bind_ceed::CeedOperatorField,
                 num_inputs as usize,
             )
         };
-        Ok(inputs_slice)
+        // And finally build vec
+        let ceed = {
+            let ceed_raw = self.op_core.ceed();
+            let mut ptr = std::ptr::null_mut();
+            unsafe {
+                bind_ceed::CeedReferenceCopy(ceed_raw, &mut ptr); // refcount
+            }
+            crate::Ceed { ptr }
+        };
+        let inputs = (0..num_inputs as usize)
+            .map(|i| unsafe { crate::OperatorField::from_raw(inputs_slice[i], ceed.clone()) })
+            .collect::<crate::Result<Vec<_>>>()?;
+        Ok(inputs)
     }
 
     /// Get a slice of Operator outputs
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
     ///
     /// // Operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -873,34 +926,45 @@ impl<'a> Operator<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn outputs(&self) -> crate::Result<&[crate::OperatorField]> {
+    pub fn outputs(&self) -> crate::Result<Vec<crate::OperatorField<'_>>> {
         // Get array of raw C pointers for outputs
         let mut num_outputs = 0;
         let mut outputs_ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorGetFields(
                 self.op_core.ptr,
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
-                std::ptr::null_mut() as *mut *mut bind_ceed::CeedOperatorField,
+                std::ptr::null_mut(),
                 &mut num_outputs,
                 &mut outputs_ptr,
             )
-        };
-        self.op_core.check_error(ierr)?;
+        })?;
         // Convert raw C pointers to fixed length slice
         let outputs_slice = unsafe {
             std::slice::from_raw_parts(
-                outputs_ptr as *const crate::OperatorField,
+                outputs_ptr as *mut bind_ceed::CeedOperatorField,
                 num_outputs as usize,
             )
         };
-        Ok(outputs_slice)
+        // And finally build vec
+        let ceed = {
+            let ceed_raw = self.op_core.ceed();
+            let mut ptr = std::ptr::null_mut();
+            unsafe {
+                bind_ceed::CeedReferenceCopy(ceed_raw, &mut ptr); // refcount
+            }
+            crate::Ceed { ptr }
+        };
+        let outputs = (0..num_outputs as usize)
+            .map(|i| unsafe { crate::OperatorField::from_raw(outputs_slice[i], ceed.clone()) })
+            .collect::<crate::Result<Vec<_>>>()?;
+        Ok(outputs)
     }
 
     /// Check if Operator is setup correctly
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -943,7 +1007,7 @@ impl<'a> Operator<'a> {
     ///
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -980,7 +1044,7 @@ impl<'a> Operator<'a> {
     ///
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let qf = ceed.q_function_interior_by_name("Mass1DBuild")?;
@@ -1027,7 +1091,7 @@ impl<'a> Operator<'a> {
     /// * `assembled` - Vector to store assembled Operator diagonal
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -1134,7 +1198,7 @@ impl<'a> Operator<'a> {
     ///
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -1246,7 +1310,7 @@ impl<'a> Operator<'a> {
     ///                   `[nodes, component out, component in]`.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionInputs, QFunctionOpt, QFunctionOutputs, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -1385,7 +1449,7 @@ impl<'a> Operator<'a> {
     ///                   `[nodes, component out, component in]`.
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionInputs, QFunctionOpt, QFunctionOutputs, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -1517,7 +1581,7 @@ impl<'a> Operator<'a> {
     /// * `basis_coarse` - Coarse grid active vector basis
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 15;
@@ -1625,7 +1689,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_fine.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 50.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     ///
@@ -1635,7 +1699,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_coarse.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 50.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     /// # Ok(())
@@ -1650,7 +1714,7 @@ impl<'a> Operator<'a> {
         let mut ptr_coarse = std::ptr::null_mut();
         let mut ptr_prolong = std::ptr::null_mut();
         let mut ptr_restrict = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorMultigridLevelCreate(
                 self.op_core.ptr,
                 p_mult_fine.ptr,
@@ -1660,11 +1724,10 @@ impl<'a> Operator<'a> {
                 &mut ptr_prolong,
                 &mut ptr_restrict,
             )
-        };
-        self.op_core.check_error(ierr)?;
-        let op_coarse = Operator::from_raw(ptr_coarse)?;
-        let op_prolong = Operator::from_raw(ptr_prolong)?;
-        let op_restrict = Operator::from_raw(ptr_restrict)?;
+        })?;
+        let op_coarse = unsafe { Operator::from_raw(ptr_coarse)? };
+        let op_prolong = unsafe { Operator::from_raw(ptr_prolong)? };
+        let op_restrict = unsafe { Operator::from_raw(ptr_restrict)? };
         Ok((op_coarse, op_prolong, op_restrict))
     }
 
@@ -1677,7 +1740,7 @@ impl<'a> Operator<'a> {
     /// * `interp_c_to_f` - Matrix for coarse to fine
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionOpt, QuadMode, Scalar, TransposeMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 15;
@@ -1814,7 +1877,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_fine.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 10.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     ///
@@ -1824,7 +1887,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_coarse.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 10.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     /// # Ok(())
@@ -1835,12 +1898,12 @@ impl<'a> Operator<'a> {
         p_mult_fine: &Vector,
         rstr_coarse: &ElemRestriction,
         basis_coarse: &Basis,
-        interpCtoF: &Vec<Scalar>,
+        interpCtoF: &[crate::Scalar],
     ) -> crate::Result<(Operator<'b>, Operator<'b>, Operator<'b>)> {
         let mut ptr_coarse = std::ptr::null_mut();
         let mut ptr_prolong = std::ptr::null_mut();
         let mut ptr_restrict = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorMultigridLevelCreateTensorH1(
                 self.op_core.ptr,
                 p_mult_fine.ptr,
@@ -1851,11 +1914,10 @@ impl<'a> Operator<'a> {
                 &mut ptr_prolong,
                 &mut ptr_restrict,
             )
-        };
-        self.op_core.check_error(ierr)?;
-        let op_coarse = Operator::from_raw(ptr_coarse)?;
-        let op_prolong = Operator::from_raw(ptr_prolong)?;
-        let op_restrict = Operator::from_raw(ptr_restrict)?;
+        })?;
+        let op_coarse = unsafe { Operator::from_raw(ptr_coarse)? };
+        let op_prolong = unsafe { Operator::from_raw(ptr_prolong)? };
+        let op_restrict = unsafe { Operator::from_raw(ptr_restrict)? };
         Ok((op_coarse, op_prolong, op_restrict))
     }
 
@@ -1868,7 +1930,7 @@ impl<'a> Operator<'a> {
     /// * `interp_c_to_f` - Matrix for coarse to fine
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, EvalMode, MemType, QFunctionOpt, QuadMode, Scalar, TransposeMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 15;
@@ -2005,7 +2067,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_fine.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 10.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     ///
@@ -2015,7 +2077,7 @@ impl<'a> Operator<'a> {
     /// // Check
     /// let sum: Scalar = v_coarse.view()?.iter().sum();
     /// assert!(
-    ///     (sum - 2.0).abs() < 10.0 * libceed::EPSILON,
+    ///     (sum - 2.0).abs() < 200.0 * libceed::EPSILON,
     ///     "Incorrect interval length computed"
     /// );
     /// # Ok(())
@@ -2026,12 +2088,12 @@ impl<'a> Operator<'a> {
         p_mult_fine: &Vector,
         rstr_coarse: &ElemRestriction,
         basis_coarse: &Basis,
-        interpCtoF: &[Scalar],
+        interpCtoF: &[crate::Scalar],
     ) -> crate::Result<(Operator<'b>, Operator<'b>, Operator<'b>)> {
         let mut ptr_coarse = std::ptr::null_mut();
         let mut ptr_prolong = std::ptr::null_mut();
         let mut ptr_restrict = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.op_core.check_error(unsafe {
             bind_ceed::CeedOperatorMultigridLevelCreateH1(
                 self.op_core.ptr,
                 p_mult_fine.ptr,
@@ -2042,11 +2104,10 @@ impl<'a> Operator<'a> {
                 &mut ptr_prolong,
                 &mut ptr_restrict,
             )
-        };
-        self.op_core.check_error(ierr)?;
-        let op_coarse = Operator::from_raw(ptr_coarse)?;
-        let op_prolong = Operator::from_raw(ptr_prolong)?;
-        let op_restrict = Operator::from_raw(ptr_restrict)?;
+        })?;
+        let op_coarse = unsafe { Operator::from_raw(ptr_coarse)? };
+        let op_prolong = unsafe { Operator::from_raw(ptr_prolong)? };
+        let op_restrict = unsafe { Operator::from_raw(ptr_restrict)? };
         Ok((op_coarse, op_prolong, op_restrict))
     }
 }
@@ -2058,8 +2119,7 @@ impl<'a> CompositeOperator<'a> {
     // Constructor
     pub fn create(ceed: &crate::Ceed) -> crate::Result<Self> {
         let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe { bind_ceed::CeedCompositeOperatorCreate(ceed.ptr, &mut ptr) };
-        ceed.check_error(ierr)?;
+        ceed.check_error(unsafe { bind_ceed::CeedOperatorCreateComposite(ceed.ptr, &mut ptr) })?;
         Ok(Self {
             op_core: OperatorCore {
                 ptr,
@@ -2073,13 +2133,13 @@ impl<'a> CompositeOperator<'a> {
     /// * 'name' - Name to set
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     ///
     /// // Sub operator field arguments
     /// let ne = 3;
-    /// let q = 4 as usize;
+    /// let q = 4_usize;
     /// let mut ind: Vec<i32> = vec![0; 2 * ne];
     /// for i in 0..ne {
     ///     ind[2 * i + 0] = i as i32;
@@ -2130,7 +2190,7 @@ impl<'a> CompositeOperator<'a> {
     /// * `output` - Output Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -2227,7 +2287,7 @@ impl<'a> CompositeOperator<'a> {
     /// * `output` - Output Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, Scalar, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
@@ -2323,7 +2383,7 @@ impl<'a> CompositeOperator<'a> {
     /// * `subop` - Sub-Operator
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, QFunctionOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut op = ceed.composite_operator()?;
@@ -2340,16 +2400,16 @@ impl<'a> CompositeOperator<'a> {
     /// ```
     #[allow(unused_mut)]
     pub fn sub_operator(mut self, subop: &Operator) -> crate::Result<Self> {
-        let ierr =
-            unsafe { bind_ceed::CeedCompositeOperatorAddSub(self.op_core.ptr, subop.op_core.ptr) };
-        self.op_core.check_error(ierr)?;
+        self.op_core.check_error(unsafe {
+            bind_ceed::CeedOperatorCompositeAddSub(self.op_core.ptr, subop.op_core.ptr)
+        })?;
         Ok(self)
     }
 
     /// Check if CompositeOperator is setup correctly
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, BasisOpt, ElemRestrictionOpt, MemType, QFunctionOpt, QuadMode, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let ne = 4;
diff --git a/rust/libceed/src/qfunction.rs b/rust/libceed/src/qfunction.rs
index 0d32d01d28..f1eb5786f9 100644
--- a/rust/libceed/src/qfunction.rs
+++ b/rust/libceed/src/qfunction.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -10,7 +10,7 @@
 
 use std::pin::Pin;
 
-use crate::prelude::*;
+use crate::{prelude::*, vector::Vector, MAX_QFUNCTION_FIELDS};
 
 pub type QFunctionInputs<'a> = [&'a [crate::Scalar]; MAX_QFUNCTION_FIELDS];
 pub type QFunctionOutputs<'a> = [&'a mut [crate::Scalar]; MAX_QFUNCTION_FIELDS];
@@ -82,7 +82,7 @@ impl<'a> QFunctionField<'a> {
     /// Get the evaluation mode of a QFunctionField
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// const Q: usize = 8;
@@ -108,7 +108,7 @@ impl<'a> QFunctionField<'a> {
         unsafe {
             bind_ceed::CeedQFunctionFieldGetEvalMode(self.ptr, &mut mode);
         }
-        crate::EvalMode::from_u32(mode as u32)
+        crate::EvalMode::from_u32(mode)
     }
 }
 
@@ -139,7 +139,7 @@ impl<'a> From<&'a QFunctionByName<'_>> for QFunctionOpt<'a> {
 
 impl<'a> QFunctionOpt<'a> {
     /// Transform a Rust libCEED QFunctionOpt into C libCEED CeedQFunction
-    pub(crate) fn to_raw(self) -> bind_ceed::CeedQFunction {
+    pub(crate) fn to_raw(&self) -> bind_ceed::CeedQFunction {
         match self {
             Self::SomeQFunction(qfunc) => qfunc.qf_core.ptr,
             Self::SomeQFunctionByName(qfunc) => qfunc.qf_core.ptr,
@@ -150,7 +150,7 @@ impl<'a> QFunctionOpt<'a> {
     /// Check if a QFunctionOpt is Some
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -191,7 +191,7 @@ impl<'a> QFunctionOpt<'a> {
     /// Check if a QFunctionOpt is SomeQFunction
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -232,7 +232,7 @@ impl<'a> QFunctionOpt<'a> {
     /// Check if a QFunctionOpt is SomeQFunctionByName
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -282,7 +282,7 @@ impl<'a> QFunctionOpt<'a> {
     /// Check if a QFunctionOpt is None
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOpt, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -389,7 +389,7 @@ impl<'a> fmt::Display for QFunctionCore<'a> {
 /// View a QFunction
 ///
 /// ```
-/// # use libceed::prelude::*;
+/// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs};
 /// # fn main() -> libceed::Result<()> {
 /// # let ceed = libceed::Ceed::default_init();
 /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -439,14 +439,16 @@ impl<'a> fmt::Display for QFunctionByName<'a> {
 // Core functionality
 // -----------------------------------------------------------------------------
 impl<'a> QFunctionCore<'a> {
+    // Raw Ceed for error handling
+    #[doc(hidden)]
+    fn ceed(&self) -> bind_ceed::Ceed {
+        unsafe { bind_ceed::CeedQFunctionReturnCeed(self.ptr) }
+    }
+
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedQFunctionGetCeed(self.ptr, &mut ptr);
-        }
-        crate::check_error(ptr, ierr)
+        crate::check_error(|| self.ceed(), ierr)
     }
 
     // Common implementation
@@ -460,56 +462,47 @@ impl<'a> QFunctionCore<'a> {
             v_c[i] = v[i].ptr;
         }
         let Q = i32::try_from(Q).unwrap();
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedQFunctionApply(self.ptr, Q, u_c.as_mut_ptr(), v_c.as_mut_ptr())
-        };
-        self.check_error(ierr)
+        })
     }
 
-    pub fn inputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn inputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         // Get array of raw C pointers for inputs
         let mut num_inputs = 0;
         let mut inputs_ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedQFunctionGetFields(
                 self.ptr,
                 &mut num_inputs,
                 &mut inputs_ptr,
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
-                std::ptr::null_mut() as *mut *mut bind_ceed::CeedQFunctionField,
+                std::ptr::null_mut(),
             )
-        };
-        self.check_error(ierr)?;
+        })?;
         // Convert raw C pointers to fixed length slice
         let inputs_slice = unsafe {
-            std::slice::from_raw_parts(
-                inputs_ptr as *const crate::QFunctionField,
-                num_inputs as usize,
-            )
+            std::slice::from_raw_parts(inputs_ptr as *const QFunctionField, num_inputs as usize)
         };
         Ok(inputs_slice)
     }
 
-    pub fn outputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn outputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         // Get array of raw C pointers for outputs
         let mut num_outputs = 0;
         let mut outputs_ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedQFunctionGetFields(
                 self.ptr,
                 std::ptr::null_mut() as *mut bind_ceed::CeedInt,
-                std::ptr::null_mut() as *mut *mut bind_ceed::CeedQFunctionField,
+                std::ptr::null_mut(),
                 &mut num_outputs,
                 &mut outputs_ptr,
             )
-        };
-        self.check_error(ierr)?;
+        })?;
         // Convert raw C pointers to fixed length slice
         let outputs_slice = unsafe {
-            std::slice::from_raw_parts(
-                outputs_ptr as *const crate::QFunctionField,
-                num_outputs as usize,
-            )
+            std::slice::from_raw_parts(outputs_ptr as *const QFunctionField, num_outputs as usize)
         };
         Ok(outputs_slice)
     }
@@ -573,12 +566,6 @@ unsafe extern "C" fn trampoline(
     (trampoline_data.get_unchecked_mut().user_f)(inputs_array, outputs_array)
 }
 
-unsafe extern "C" fn destroy_trampoline(ctx: *mut ::std::os::raw::c_void) -> ::std::os::raw::c_int {
-    let trampoline_data: Pin<&mut QFunctionTrampolineData> = std::mem::transmute(ctx);
-    drop(trampoline_data);
-    0 // Clean error code
-}
-
 // -----------------------------------------------------------------------------
 // QFunction
 // -----------------------------------------------------------------------------
@@ -609,7 +596,7 @@ impl<'a> QFunction<'a> {
 
         // Create QFunction
         let vlength = i32::try_from(vlength).unwrap();
-        let mut ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedQFunctionCreateInterior(
                 ceed.ptr,
                 vlength,
@@ -617,33 +604,27 @@ impl<'a> QFunction<'a> {
                 source_c.as_ptr(),
                 &mut ptr,
             )
-        };
-        ceed.check_error(ierr)?;
+        })?;
 
         // Set closure
         let mut qf_ctx_ptr = std::ptr::null_mut();
-        ierr = unsafe { bind_ceed::CeedQFunctionContextCreate(ceed.ptr, &mut qf_ctx_ptr) };
-        ceed.check_error(ierr)?;
-        ierr = unsafe {
+        ceed.check_error(unsafe {
+            bind_ceed::CeedQFunctionContextCreate(ceed.ptr, &mut qf_ctx_ptr)
+        })?;
+        ceed.check_error(unsafe {
             bind_ceed::CeedQFunctionContextSetData(
                 qf_ctx_ptr,
                 crate::MemType::Host as bind_ceed::CeedMemType,
                 crate::CopyMode::UsePointer as bind_ceed::CeedCopyMode,
                 std::mem::size_of::<QFunctionTrampolineData>(),
-                std::mem::transmute(trampoline_data.as_ref()),
+                std::mem::transmute::<
+                    std::pin::Pin<&QFunctionTrampolineData>,
+                    *mut std::ffi::c_void,
+                >(trampoline_data.as_ref()),
             )
-        };
-        ceed.check_error(ierr)?;
-        ierr = unsafe {
-            bind_ceed::CeedQFunctionContextSetDataDestroy(
-                qf_ctx_ptr,
-                crate::MemType::Host as bind_ceed::CeedMemType,
-                Some(destroy_trampoline),
-            )
-        };
-        ceed.check_error(ierr)?;
-        ierr = unsafe { bind_ceed::CeedQFunctionSetContext(ptr, qf_ctx_ptr) };
-        ceed.check_error(ierr)?;
+        })?;
+        ceed.check_error(unsafe { bind_ceed::CeedQFunctionSetContext(ptr, qf_ctx_ptr) })?;
+        ceed.check_error(unsafe { bind_ceed::CeedQFunctionContextDestroy(&mut qf_ctx_ptr) })?;
         Ok(Self {
             qf_core: QFunctionCore {
                 ptr,
@@ -661,7 +642,7 @@ impl<'a> QFunction<'a> {
     /// * `output` - Array of output Vectors
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -729,7 +710,7 @@ impl<'a> QFunction<'a> {
     ///                   gradients, `EvalMode::Weight` to use quadrature weights
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -763,10 +744,9 @@ impl<'a> QFunction<'a> {
             i32::try_from(size).unwrap(),
             emode as bind_ceed::CeedEvalMode,
         );
-        let ierr = unsafe {
+        self.qf_core.check_error(unsafe {
             bind_ceed::CeedQFunctionAddInput(self.qf_core.ptr, name_c.as_ptr(), size, emode)
-        };
-        self.qf_core.check_error(ierr)?;
+        })?;
         Ok(self)
     }
 
@@ -780,7 +760,7 @@ impl<'a> QFunction<'a> {
     ///                   gradients
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -813,17 +793,16 @@ impl<'a> QFunction<'a> {
             i32::try_from(size).unwrap(),
             emode as bind_ceed::CeedEvalMode,
         );
-        let ierr = unsafe {
+        self.qf_core.check_error(unsafe {
             bind_ceed::CeedQFunctionAddOutput(self.qf_core.ptr, name_c.as_ptr(), size, emode)
-        };
-        self.qf_core.check_error(ierr)?;
+        })?;
         Ok(self)
     }
 
     /// Get a slice of QFunction inputs
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -847,14 +826,14 @@ impl<'a> QFunction<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn inputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn inputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         self.qf_core.inputs()
     }
 
     /// Get a slice of QFunction outputs
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut user_f = |[u, weights, ..]: QFunctionInputs, [v, ..]: QFunctionOutputs| {
@@ -877,7 +856,7 @@ impl<'a> QFunction<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn outputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn outputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         self.qf_core.outputs()
     }
 }
@@ -890,10 +869,9 @@ impl<'a> QFunctionByName<'a> {
     pub fn create(ceed: &crate::Ceed, name: &str) -> crate::Result<Self> {
         let name_c = CString::new(name).expect("CString::new failed");
         let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        ceed.check_error(unsafe {
             bind_ceed::CeedQFunctionCreateInteriorByName(ceed.ptr, name_c.as_ptr(), &mut ptr)
-        };
-        ceed.check_error(ierr)?;
+        })?;
         Ok(Self {
             qf_core: QFunctionCore {
                 ptr,
@@ -909,7 +887,7 @@ impl<'a> QFunctionByName<'a> {
     /// * `output` - Array of output Vectors
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, EvalMode, QFunctionInputs, QFunctionOutputs, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// const Q: usize = 8;
@@ -982,7 +960,7 @@ impl<'a> QFunctionByName<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn inputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn inputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         self.qf_core.inputs()
     }
 
@@ -1001,7 +979,7 @@ impl<'a> QFunctionByName<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn outputs(&self) -> crate::Result<&[crate::QFunctionField]> {
+    pub fn outputs(&self) -> crate::Result<&[QFunctionField<'_>]> {
         self.qf_core.outputs()
     }
 }
diff --git a/rust/libceed/src/vector.rs b/rust/libceed/src/vector.rs
index c90d8a295a..a1f9cd5178 100644
--- a/rust/libceed/src/vector.rs
+++ b/rust/libceed/src/vector.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -34,7 +34,7 @@ impl<'a> From<&'a Vector<'_>> for VectorOpt<'a> {
 }
 impl<'a> VectorOpt<'a> {
     /// Transform a Rust libCEED VectorOpt into C libCEED CeedVector
-    pub(crate) fn to_raw(self) -> bind_ceed::CeedVector {
+    pub(crate) fn to_raw(&self) -> bind_ceed::CeedVector {
         match self {
             Self::Some(vec) => vec.ptr,
             Self::Active => unsafe { bind_ceed::CEED_VECTOR_ACTIVE },
@@ -45,7 +45,7 @@ impl<'a> VectorOpt<'a> {
     /// Check if a VectorOpt is Some
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let vec = libceed::vector::Vector::from_slice(&ceed, &[1., 2., 3.])?;
@@ -71,7 +71,7 @@ impl<'a> VectorOpt<'a> {
     /// Check if a VectorOpt is Active
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let vec = libceed::vector::Vector::from_slice(&ceed, &[1., 2., 3.])?;
@@ -97,7 +97,7 @@ impl<'a> VectorOpt<'a> {
     /// Check if a VectorOpt is Some
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, VectorOpt};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let vec = libceed::vector::Vector::from_slice(&ceed, &[1., 2., 3.])?;
@@ -125,7 +125,7 @@ impl<'a> VectorOpt<'a> {
 // Vector borrowed slice wrapper
 // -----------------------------------------------------------------------------
 pub struct VectorSliceWrapper<'a> {
-    pub(crate) vector: crate::Vector<'a>,
+    pub(crate) vector: Vector<'a>,
     pub(crate) _slice: &'a mut [crate::Scalar],
 }
 
@@ -149,7 +149,7 @@ impl<'a> Drop for VectorSliceWrapper<'a> {
 // -----------------------------------------------------------------------------
 impl<'a> VectorSliceWrapper<'a> {
     fn from_vector_and_slice_mut<'b>(
-        vec: &'b mut crate::Vector,
+        vec: &'b mut Vector,
         slice: &'a mut [crate::Scalar],
     ) -> crate::Result<Self> {
         assert_eq!(vec.length(), slice.len());
@@ -157,18 +157,16 @@ impl<'a> VectorSliceWrapper<'a> {
             crate::MemType::Host as bind_ceed::CeedMemType,
             crate::CopyMode::UsePointer as bind_ceed::CeedCopyMode,
         );
-        let ierr = unsafe {
+        vec.check_error(unsafe {
             bind_ceed::CeedVectorSetArray(
                 vec.ptr,
                 host,
                 copy_mode,
                 slice.as_ptr() as *mut crate::Scalar,
             )
-        };
-        vec.check_error(ierr)?;
-
+        })?;
         Ok(Self {
-            vector: crate::Vector::from_raw(vec.ptr_copy_mut()?)?,
+            vector: unsafe { Vector::from_raw(vec.ptr_copy_mut()?)? },
             _slice: slice,
         })
     }
@@ -247,15 +245,14 @@ impl<'a> Vector<'a> {
     pub fn create(ceed: &crate::Ceed, n: usize) -> crate::Result<Self> {
         let n = isize::try_from(n).unwrap();
         let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe { bind_ceed::CeedVectorCreate(ceed.ptr, n, &mut ptr) };
-        ceed.check_error(ierr)?;
+        ceed.check_error(unsafe { bind_ceed::CeedVectorCreate(ceed.ptr, n, &mut ptr) })?;
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
         })
     }
 
-    pub(crate) fn from_raw(ptr: bind_ceed::CeedVector) -> crate::Result<Self> {
+    pub(crate) unsafe fn from_raw(ptr: bind_ceed::CeedVector) -> crate::Result<Self> {
         Ok(Self {
             ptr,
             _lifeline: PhantomData,
@@ -264,8 +261,7 @@ impl<'a> Vector<'a> {
 
     fn ptr_copy_mut(&mut self) -> crate::Result<bind_ceed::CeedVector> {
         let mut ptr_copy = std::ptr::null_mut();
-        let ierr = unsafe { bind_ceed::CeedVectorReferenceCopy(self.ptr, &mut ptr_copy) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe { bind_ceed::CeedVectorReferenceCopy(self.ptr, &mut ptr_copy) })?;
         Ok(ptr_copy)
     }
 
@@ -276,7 +272,7 @@ impl<'a> Vector<'a> {
     /// * `vec_source` - vector to copy array values from
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let a = ceed.vector_from_slice(&[1., 2., 3.])?;
@@ -290,9 +286,8 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     /// ```
-    pub fn copy_from(&mut self, vec_source: &crate::Vector) -> crate::Result<i32> {
-        let ierr = unsafe { bind_ceed::CeedVectorCopy(vec_source.ptr, self.ptr) };
-        self.check_error(ierr)
+    pub fn copy_from(&mut self, vec_source: &Vector) -> crate::Result<i32> {
+        self.check_error(unsafe { bind_ceed::CeedVectorCopy(vec_source.ptr, self.ptr) })
     }
 
     /// Create a Vector from a slice
@@ -305,7 +300,7 @@ impl<'a> Vector<'a> {
     /// # use libceed::prelude::*;
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
-    /// let vec = vector::Vector::from_slice(&ceed, &[1., 2., 3.])?;
+    /// let vec = libceed::Vector::from_slice(&ceed, &[1., 2., 3.])?;
     /// assert_eq!(vec.length(), 3, "Incorrect length from slice");
     /// # Ok(())
     /// # }
@@ -340,19 +335,20 @@ impl<'a> Vector<'a> {
             crate::CopyMode::UsePointer as bind_ceed::CeedCopyMode,
         );
         let v = v.as_ptr() as *mut crate::Scalar;
-        let ierr = unsafe { bind_ceed::CeedVectorSetArray(x.ptr, host, user_pointer, v) };
-        ceed.check_error(ierr)?;
+        ceed.check_error(unsafe { bind_ceed::CeedVectorSetArray(x.ptr, host, user_pointer, v) })?;
         Ok(x)
     }
 
+    // Raw Ceed for error handling
+    #[doc(hidden)]
+    fn ceed(&self) -> bind_ceed::Ceed {
+        unsafe { bind_ceed::CeedVectorReturnCeed(self.ptr) }
+    }
+
     // Error handling
     #[doc(hidden)]
     fn check_error(&self, ierr: i32) -> crate::Result<i32> {
-        let mut ptr = std::ptr::null_mut();
-        unsafe {
-            bind_ceed::CeedVectorGetCeed(self.ptr, &mut ptr);
-        }
-        crate::check_error(ptr, ierr)
+        crate::check_error(|| self.ceed(), ierr)
     }
 
     /// Returns the length of a Vector
@@ -389,6 +385,23 @@ impl<'a> Vector<'a> {
         self.length()
     }
 
+    /// Returns true if the Vector contains no elements
+    ///
+    /// ```
+    /// # use libceed::prelude::*;
+    /// # fn main() -> libceed::Result<()> {
+    /// # let ceed = libceed::Ceed::default_init();
+    /// let vec = ceed.vector(10)?;
+    /// assert!(!vec.is_empty(), "Incorrect emptiness");
+    /// let empty_vec = ceed.vector(0)?;
+    /// assert!(empty_vec.is_empty(), "Incorrect emptiness");
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn is_empty(&self) -> bool {
+        self.length() == 0
+    }
+
     /// Set the Vector to a constant value
     ///
     /// # arguments
@@ -412,8 +425,7 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     pub fn set_value(&mut self, value: crate::Scalar) -> crate::Result<i32> {
-        let ierr = unsafe { bind_ceed::CeedVectorSetValue(self.ptr, value) };
-        self.check_error(ierr)
+        self.check_error(unsafe { bind_ceed::CeedVectorSetValue(self.ptr, value) })
     }
 
     /// Set values from a slice of the same length
@@ -423,7 +435,7 @@ impl<'a> Vector<'a> {
     /// * `slice` - values to into self; length must match
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut vec = ceed.vector(4)?;
@@ -441,15 +453,14 @@ impl<'a> Vector<'a> {
             crate::MemType::Host as bind_ceed::CeedMemType,
             crate::CopyMode::CopyValues as bind_ceed::CeedCopyMode,
         );
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedVectorSetArray(
                 self.ptr,
                 host,
                 copy_mode,
                 slice.as_ptr() as *mut crate::Scalar,
             )
-        };
-        self.check_error(ierr)
+        })
     }
 
     /// Wrap a mutable slice in a Vector of the same length
@@ -459,7 +470,7 @@ impl<'a> Vector<'a> {
     /// * `slice` - values to wrap in self; length must match
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut vec = ceed.vector(4)?;
@@ -501,7 +512,7 @@ impl<'a> Vector<'a> {
         &mut self,
         slice: &'b mut [crate::Scalar],
     ) -> crate::Result<VectorSliceWrapper<'b>> {
-        crate::VectorSliceWrapper::from_vector_and_slice_mut(self, slice)
+        VectorSliceWrapper::from_vector_and_slice_mut(self, slice)
     }
 
     /// Sync the Vector to a specified memtype
@@ -511,7 +522,7 @@ impl<'a> Vector<'a> {
     /// * `mtype` - Memtype to be synced
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, MemType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let len = 10;
@@ -528,9 +539,9 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     pub fn sync(&self, mtype: crate::MemType) -> crate::Result<i32> {
-        let ierr =
-            unsafe { bind_ceed::CeedVectorSyncArray(self.ptr, mtype as bind_ceed::CeedMemType) };
-        self.check_error(ierr)
+        self.check_error(unsafe {
+            bind_ceed::CeedVectorSyncArray(self.ptr, mtype as bind_ceed::CeedMemType)
+        })
     }
 
     /// Create an immutable view
@@ -550,7 +561,7 @@ impl<'a> Vector<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn view(&self) -> crate::Result<VectorView> {
+    pub fn view(&self) -> crate::Result<VectorView<'_>> {
         VectorView::new(self)
     }
 
@@ -572,7 +583,7 @@ impl<'a> Vector<'a> {
     /// # Ok(())
     /// # }
     /// ```
-    pub fn view_mut(&mut self) -> crate::Result<VectorViewMut> {
+    pub fn view_mut(&mut self) -> crate::Result<VectorViewMut<'_>> {
         VectorViewMut::new(self)
     }
 
@@ -583,7 +594,7 @@ impl<'a> Vector<'a> {
     /// * `ntype` - Norm type One, Two, or Max
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, NormType};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let vec = ceed.vector_from_slice(&[1., 2., 3., 4.])?;
@@ -601,10 +612,9 @@ impl<'a> Vector<'a> {
     /// ```
     pub fn norm(&self, ntype: crate::NormType) -> crate::Result<crate::Scalar> {
         let mut res: crate::Scalar = 0.0;
-        let ierr = unsafe {
+        self.check_error(unsafe {
             bind_ceed::CeedVectorNorm(self.ptr, ntype as bind_ceed::CeedNormType, &mut res)
-        };
-        self.check_error(ierr)?;
+        })?;
         Ok(res)
     }
 
@@ -615,7 +625,7 @@ impl<'a> Vector<'a> {
     /// * `alpha` - scaling factor
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut vec = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;
@@ -629,8 +639,7 @@ impl<'a> Vector<'a> {
     /// ```
     #[allow(unused_mut)]
     pub fn scale(mut self, alpha: crate::Scalar) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorScale(self.ptr, alpha) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe { bind_ceed::CeedVectorScale(self.ptr, alpha) })?;
         Ok(self)
     }
 
@@ -642,7 +651,7 @@ impl<'a> Vector<'a> {
     /// * `x`     - second vector, must be different than self
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let x = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;
@@ -656,9 +665,8 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     #[allow(unused_mut)]
-    pub fn axpy(mut self, alpha: crate::Scalar, x: &crate::Vector) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorAXPY(self.ptr, alpha, x.ptr) };
-        self.check_error(ierr)?;
+    pub fn axpy(mut self, alpha: crate::Scalar, x: &Vector) -> crate::Result<Self> {
+        self.check_error(unsafe { bind_ceed::CeedVectorAXPY(self.ptr, alpha, x.ptr) })?;
         Ok(self)
     }
 
@@ -671,7 +679,7 @@ impl<'a> Vector<'a> {
     /// * `x`     - second vector, must be different than self
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let x = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;
@@ -689,10 +697,9 @@ impl<'a> Vector<'a> {
         mut self,
         alpha: crate::Scalar,
         beta: crate::Scalar,
-        x: &crate::Vector,
+        x: &Vector,
     ) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorAXPBY(self.ptr, alpha, beta, x.ptr) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe { bind_ceed::CeedVectorAXPBY(self.ptr, alpha, beta, x.ptr) })?;
         Ok(self)
     }
 
@@ -704,7 +711,7 @@ impl<'a> Vector<'a> {
     /// * `y` - second vector for product
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut w = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;
@@ -719,9 +726,8 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     #[allow(unused_mut)]
-    pub fn pointwise_mult(mut self, x: &crate::Vector, y: &crate::Vector) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, x.ptr, y.ptr) };
-        self.check_error(ierr)?;
+    pub fn pointwise_mult(mut self, x: &Vector, y: &Vector) -> crate::Result<Self> {
+        self.check_error(unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, x.ptr, y.ptr) })?;
         Ok(self)
     }
 
@@ -732,7 +738,7 @@ impl<'a> Vector<'a> {
     /// * `x` - second vector for product
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut w = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;
@@ -746,16 +752,15 @@ impl<'a> Vector<'a> {
     /// # }
     /// ```
     #[allow(unused_mut)]
-    pub fn pointwise_scale(mut self, x: &crate::Vector) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, x.ptr) };
-        self.check_error(ierr)?;
+    pub fn pointwise_scale(mut self, x: &Vector) -> crate::Result<Self> {
+        self.check_error(unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, x.ptr) })?;
         Ok(self)
     }
 
     /// Compute the pointwise multiplication w = w .* w for a Vector
     ///
     /// ```
-    /// # use libceed::prelude::*;
+    /// # use libceed::{prelude::*, Scalar};
     /// # fn main() -> libceed::Result<()> {
     /// # let ceed = libceed::Ceed::default_init();
     /// let mut w = ceed.vector_from_slice(&[0., 1., 2., 3., 4.])?;
@@ -769,8 +774,9 @@ impl<'a> Vector<'a> {
     /// ```
     #[allow(unused_mut)]
     pub fn pointwise_square(mut self) -> crate::Result<Self> {
-        let ierr = unsafe { bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, self.ptr) };
-        self.check_error(ierr)?;
+        self.check_error(unsafe {
+            bind_ceed::CeedVectorPointwiseMult(self.ptr, self.ptr, self.ptr)
+        })?;
         Ok(self)
     }
 }
@@ -791,18 +797,14 @@ impl<'a> VectorView<'a> {
     /// Construct a VectorView from a Vector reference
     fn new(vec: &'a Vector) -> crate::Result<Self> {
         let mut array = std::ptr::null();
-        let ierr = unsafe {
+        vec.check_error(unsafe {
             bind_ceed::CeedVectorGetArrayRead(
                 vec.ptr,
                 crate::MemType::Host as bind_ceed::CeedMemType,
                 &mut array,
             )
-        };
-        vec.check_error(ierr)?;
-        Ok(Self {
-            vec: vec,
-            array: array,
-        })
+        })?;
+        Ok(Self { vec, array })
     }
 }
 
@@ -843,19 +845,15 @@ pub struct VectorViewMut<'a> {
 impl<'a> VectorViewMut<'a> {
     /// Construct a VectorViewMut from a Vector reference
     fn new(vec: &'a mut Vector) -> crate::Result<Self> {
-        let mut ptr = std::ptr::null_mut();
-        let ierr = unsafe {
+        let mut array = std::ptr::null_mut();
+        vec.check_error(unsafe {
             bind_ceed::CeedVectorGetArray(
                 vec.ptr,
                 crate::MemType::Host as bind_ceed::CeedMemType,
-                &mut ptr,
+                &mut array,
             )
-        };
-        vec.check_error(ierr)?;
-        Ok(Self {
-            vec: vec,
-            array: ptr,
-        })
+        })?;
+        Ok(Self { vec, array })
     }
 }
 
diff --git a/rust/libceed/tests/version-numbers.rs b/rust/libceed/tests/version-numbers.rs
index 5f276eae0b..c0f189e2e1 100644
--- a/rust/libceed/tests/version-numbers.rs
+++ b/rust/libceed/tests/version-numbers.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors
 // All Rights Reserved. See the top-level COPYRIGHT and NOTICE files for details.
 //
 // SPDX-License-Identifier: (BSD-2-Clause)
diff --git a/setup.py b/setup.py
index a2bb813e59..b0d423c815 100644
--- a/setup.py
+++ b/setup.py
@@ -79,7 +79,6 @@ def make_libceed_so(self, prefix):
 Development Status :: 4 - Beta
 Intended Audience :: Developers
 Intended Audience :: Science/Research
-License :: OSI Approved :: BSD License
 Operating System :: POSIX
 Programming Language :: C
 Programming Language :: C++
diff --git a/tests/README.md b/tests/README.md
index fd6e426420..031ff5a030 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -15,7 +15,8 @@ The tests are organized by API object, and some tests are further organized, as
     2. CeedBasis simplex basis tests\
     3. CeedBasis non-tensor H(div) basis tests\
     4. CeedBasis non-tensor H(curl) basis tests\
-    5. CeedBasis evaluation at arbitrary points tests
+    5. CeedBasis evaluation at arbitrary points tests\
+    6. CeedBasis ApplyAdd tests
 4. CeedQFunction Tests\
     0. CeedQFunction user code tests\
     1. CeedQFunction gallery code tests
diff --git a/tests/junit.py b/tests/junit.py
index 041582c172..6ea4bcb0b6 100755
--- a/tests/junit.py
+++ b/tests/junit.py
@@ -24,10 +24,12 @@ def create_argparser() -> argparse.ArgumentParser:
         help='Output mode, junit or tap',
         default=RunMode.JUNIT)
     parser.add_argument('-n', '--nproc', type=int, default=1, help='number of MPI processes')
-    parser.add_argument('-o', '--output', type=Optional[Path], default=None, help='Output file to write test')
     parser.add_argument('-b', '--junit-batch', type=str, default='', help='Name of JUnit batch for output file')
     parser.add_argument('-np', '--pool-size', type=int, default=1, help='Number of test cases to run in parallel')
-    parser.add_argument('-s', '--smartredis_dir', type=str, default='', help='path to SmartSim library, if present')
+    parser.add_argument('-s', '--search', type=str, default='.*',
+                        help='Search string to filter tests, using `re` package format')
+    parser.add_argument('-v', '--verbose', action='store_true', default=False,
+                        help='print details for all runs, not just failures')
     parser.add_argument('test', help='Test executable', nargs='?')
 
     return parser
@@ -35,6 +37,9 @@ def create_argparser() -> argparse.ArgumentParser:
 
 # Necessary functions for running tests
 class CeedSuiteSpec(SuiteSpec):
+    def __init__(self):
+        pass
+
     def get_source_path(self, test: str) -> Path:
         """Compute path to test source file
 
@@ -45,6 +50,8 @@ def get_source_path(self, test: str) -> Path:
             Path: Path to source file
         """
         prefix, rest = test.split('-', 1)
+        if prefix == 'rustqfunctions':
+            return (Path('examples') / 'rust-qfunctions' / rest).with_suffix('.c')
         if prefix == 'petsc':
             return (Path('examples') / 'petsc' / rest).with_suffix('.c')
         elif prefix == 'mfem':
@@ -58,7 +65,10 @@ def get_source_path(self, test: str) -> Path:
         elif prefix == 'solids':
             return (Path('examples') / 'solids' / rest).with_suffix('.c')
         elif test.startswith('ex'):
-            return (Path('examples') / 'ceed' / test).with_suffix('.c')
+            if test.endswith('-f'):
+                return (Path('examples') / 'ceed' / test).with_suffix('.f90')
+            else:
+                return (Path('examples') / 'ceed' / test).with_suffix('.c')
         elif test.endswith('-f'):
             return (Path('tests') / test).with_suffix('.f90')
         else:
@@ -100,9 +110,6 @@ def check_pre_skip(self, test: str, spec: TestSpec, resource: str, nproc: int) -
         Returns:
             Optional[str]: Skip reason, or `None` if test case should not be skipped
         """
-        if contains_any(resource, ['occa']) and startswith_any(
-                test, ['t4', 't5', 'ex', 'mfem', 'nek', 'petsc', 'fluids', 'solids']):
-            return 'OCCA mode not supported'
         if test.startswith('t318') and contains_any(resource, ['/gpu/cuda/ref']):
             return 'CUDA ref backend not supported'
         if test.startswith('t506') and contains_any(resource, ['/gpu/cuda/shared']):
@@ -123,9 +130,7 @@ def check_post_skip(self, test: str, spec: TestSpec, resource: str, stderr: str)
         Returns:
             Optional[str]: Skip reason, or `None` if unexpeced error
         """
-        if 'OCCA backend failed to use' in stderr:
-            return f'OCCA mode not supported'
-        elif 'Backend does not implement' in stderr:
+        if 'Backend does not implement' in stderr:
             return f'Backend does not implement'
         elif 'Can only provide HOST memory for this backend' in stderr:
             return f'Device memory not supported'
@@ -170,7 +175,7 @@ def check_required_failure(self, test: str, spec: TestSpec, resource: str, stder
         elif test_id in ['t215']:
             fail_str = 'Cannot destroy CeedElemRestriction, a process has read access to the offset data'
         elif test_id in ['t303']:
-            fail_str = 'Length of input/output vectors incompatible with basis dimensions'
+            fail_str = 'Input/output vectors too short for basis and evaluation mode'
         elif test_id in ['t408']:
             fail_str = 'CeedQFunctionContextGetData(): Cannot grant CeedQFunctionContext data access, a process has read access'
         elif test_id in ['t409'] and contains_any(resource, ['memcheck']):
@@ -193,46 +198,18 @@ def check_allowed_stdout(self, test: str) -> bool:
 if __name__ == '__main__':
     args = create_argparser().parse_args()
 
-    # run tests
-    if 'smartsim' in args.test:
-        has_smartsim: bool = args.smartredis_dir and Path(args.smartredis_dir).is_dir()
-        test_cases = []
-
-        if args.mode is RunMode.TAP:
-            print(f'1..1')
-        if has_smartsim:
-            sys.path.insert(0, str(Path(__file__).parents[1] / "examples" / "fluids"))
-            from smartsim_regression_framework import SmartSimTest
-
-            test_framework = SmartSimTest(Path(__file__).parent / 'test_dir')
-            test_framework.setup()
-
-            is_new_subtest = True
-            subtest_ok = True
-            for i, backend in enumerate(args.ceed_backends):
-                test_cases.append(test_framework.test_junit(backend))
-                if is_new_subtest and args.mode == RunMode.TAP:
-                    is_new_subtest = False
-                    print(f'# Subtest: {test_cases[0].category}')
-                    print(f'    1..{len(args.ceed_backends)}')
-                print(test_case_output_string(test_cases[i], TestSpec("SmartSim Tests"), args.mode, backend, '', i))
-            if args.mode == RunMode.TAP:
-                print(f'{"" if subtest_ok else "not "}ok 1 - {test_cases[0].category}')
-            test_framework.teardown()
-        elif args.mode is RunMode.TAP:
-            print(f'ok 1 - # SKIP SmartSim not installed')
-        result: TestSuite = TestSuite('SmartSim Tests', test_cases)
-    else:
-        result: TestSuite = run_tests(
-            args.test,
-            args.ceed_backends,
-            args.mode,
-            args.nproc,
-            CeedSuiteSpec(),
-            args.pool_size)
+    result: TestSuite = run_tests(
+        args.test,
+        args.ceed_backends,
+        args.mode,
+        args.nproc,
+        CeedSuiteSpec(),
+        args.pool_size,
+        search=args.search,
+        verbose=args.verbose)
 
     # write output and check for failures
     if args.mode is RunMode.JUNIT:
-        write_junit_xml(result, args.output, args.junit_batch)
+        write_junit_xml(result, args.junit_batch)
         if has_failures(result):
             sys.exit(1)
diff --git a/tests/junit_common.py b/tests/junit_common.py
index 607d21e9ee..ce1115a547 100644
--- a/tests/junit_common.py
+++ b/tests/junit_common.py
@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 import argparse
 import csv
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, fields
 import difflib
 from enum import Enum
 from math import isclose
@@ -10,52 +11,75 @@
 import re
 import subprocess
 import multiprocessing as mp
-from itertools import product
 import sys
 import time
-from typing import Optional, Tuple, List
+from typing import Optional, Tuple, List, Dict, Callable, Iterable, get_origin
+import shutil
 
 sys.path.insert(0, str(Path(__file__).parent / "junit-xml"))
 from junit_xml import TestCase, TestSuite, to_xml_report_string  # nopep8
 
 
+class ParseError(RuntimeError):
+    """A custom exception for failed parsing."""
+
+    def __init__(self, message):
+        super().__init__(message)
+
+
 class CaseInsensitiveEnumAction(argparse.Action):
     """Action to convert input values to lower case prior to converting to an Enum type"""
 
     def __init__(self, option_strings, dest, type, default, **kwargs):
-        if not (issubclass(type, Enum) and issubclass(type, str)):
-            raise ValueError(f"{type} must be a StrEnum or str and Enum")
+        if not issubclass(type, Enum):
+            raise ValueError(f"{type} must be an Enum")
         # store provided enum type
         self.enum_type = type
-        if isinstance(default, str):
+        if isinstance(default, self.enum_type):
+            pass
+        elif isinstance(default, str):
             default = self.enum_type(default.lower())
-        else:
+        elif isinstance(default, Iterable):
             default = [self.enum_type(v.lower()) for v in default]
+        else:
+            raise argparse.ArgumentTypeError("Invalid value type, must be str or iterable")
         # prevent automatic type conversion
         super().__init__(option_strings, dest, default=default, **kwargs)
 
     def __call__(self, parser, namespace, values, option_string=None):
-        if isinstance(values, str):
+        if isinstance(values, self.enum_type):
+            pass
+        elif isinstance(values, str):
             values = self.enum_type(values.lower())
-        else:
+        elif isinstance(values, Iterable):
             values = [self.enum_type(v.lower()) for v in values]
+        else:
+            raise argparse.ArgumentTypeError("Invalid value type, must be str or iterable")
         setattr(namespace, self.dest, values)
 
 
 @dataclass
 class TestSpec:
     """Dataclass storing information about a single test case"""
-    name: str
+    name: str = field(default_factory=str)
+    csv_rtol: float = -1
+    csv_ztol: float = -1
+    cgns_tol: float = -1
     only: List = field(default_factory=list)
     args: List = field(default_factory=list)
+    key_values: Dict = field(default_factory=dict)
 
 
-class RunMode(str, Enum):
+class RunMode(Enum):
     """Enumeration of run modes, either `RunMode.TAP` or `RunMode.JUNIT`"""
-    __str__ = str.__str__
-    __format__ = str.__format__
-    TAP: str = 'tap'
-    JUNIT: str = 'junit'
+    TAP = 'tap'
+    JUNIT = 'junit'
+
+    def __str__(self):
+        return self.value
+
+    def __repr__(self):
+        return self.value
 
 
 class SuiteSpec(ABC):
@@ -97,6 +121,11 @@ def get_output_path(self, test: str, output_file: str) -> Path:
         """
         raise NotImplementedError
 
+    @property
+    def test_failure_artifacts_path(self) -> Path:
+        """Path to test failure artifacts"""
+        return Path('build') / 'test_failure_artifacts'
+
     @property
     def cgns_tol(self):
         """Absolute tolerance for CGNS diff"""
@@ -106,7 +135,41 @@ def cgns_tol(self):
     def cgns_tol(self, val):
         self._cgns_tol = val
 
-    def post_test_hook(self, test: str, spec: TestSpec) -> None:
+    @property
+    def csv_ztol(self):
+        """Keyword arguments to be passed to diff_csv()"""
+        return getattr(self, '_csv_ztol', 3e-10)
+
+    @csv_ztol.setter
+    def csv_ztol(self, val):
+        self._csv_ztol = val
+
+    @property
+    def csv_rtol(self):
+        """Keyword arguments to be passed to diff_csv()"""
+        return getattr(self, '_csv_rtol', 1e-6)
+
+    @csv_rtol.setter
+    def csv_rtol(self, val):
+        self._csv_rtol = val
+
+    @property
+    def csv_comment_diff_fn(self):  # -> Any | Callable[..., None]:
+        return getattr(self, '_csv_comment_diff_fn', None)
+
+    @csv_comment_diff_fn.setter
+    def csv_comment_diff_fn(self, test_fn):
+        self._csv_comment_diff_fn = test_fn
+
+    @property
+    def csv_comment_str(self):
+        return getattr(self, '_csv_comment_str', '#')
+
+    @csv_comment_str.setter
+    def csv_comment_str(self, comment_str):
+        self._csv_comment_str = comment_str
+
+    def post_test_hook(self, test: str, spec: TestSpec, backend: str) -> None:
         """Function callback ran after each test case
 
         Args:
@@ -181,7 +244,7 @@ def has_cgnsdiff() -> bool:
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           env=my_env)
-    return 'not found' not in proc.stderr.decode('utf-8')
+    return 'not found' not in proc.stderr.decode('utf-8', errors='replace')
 
 
 def contains_any(base: str, substrings: List[str]) -> bool:
@@ -210,7 +273,40 @@ def startswith_any(base: str, prefixes: List[str]) -> bool:
     return any((base.startswith(prefix) for prefix in prefixes))
 
 
-def parse_test_line(line: str) -> TestSpec:
+def find_matching(line: str, open: str = '(', close: str = ')') -> Tuple[int, int]:
+    """Find the start and end positions of the first outer paired delimeters
+
+    Args:
+        line (str): Line to search
+        open (str, optional): Opening delimiter, must be different than `close`. Defaults to '('.
+        close (str, optional): Closing delimeter, must be different than `open`. Defaults to ')'.
+
+    Raises:
+        RuntimeError: If open or close is not a single character
+        RuntimeError: If open and close are the same characters
+
+    Returns:
+        Tuple[int]: If matching delimeters are found, return indices in `list`. Otherwise, return end < start.
+    """
+    if len(open) != 1 or len(close) != 1:
+        raise RuntimeError("`open` and `close` must be single characters")
+    if open == close:
+        raise RuntimeError("`open` and `close` must be different characters")
+    start: int = line.find(open)
+    if start < 0:
+        return -1, -1
+    count: int = 1
+    for i in range(start + 1, len(line)):
+        if line[i] == open:
+            count += 1
+        if line[i] == close:
+            count -= 1
+            if count == 0:
+                return start, i
+    return start, -1
+
+
+def parse_test_line(line: str, fallback_name: str = '') -> TestSpec:
     """Parse a single line of TESTARGS and CLI arguments into a `TestSpec` object
 
     Args:
@@ -219,18 +315,61 @@ def parse_test_line(line: str) -> TestSpec:
     Returns:
         TestSpec: Parsed specification of test case
     """
-    args: List[str] = re.findall("(?:\".*?\"|\\S)+", line.strip())
-    if args[0] == 'TESTARGS':
-        return TestSpec(name='', args=args[1:])
-    raw_test_args: str = args[0][args[0].index('TESTARGS(') + 9:args[0].rindex(')')]
-    # transform 'name="myname",only="serial,int32"' into {'name': 'myname', 'only': 'serial,int32'}
-    test_args: dict = dict([''.join(t).split('=') for t in re.findall(r"""([^,=]+)(=)"([^"]*)\"""", raw_test_args)])
-    name: str = test_args.get('name', '')
-    constraints: List[str] = test_args['only'].split(',') if 'only' in test_args else []
-    if len(args) > 1:
-        return TestSpec(name=name, only=constraints, args=args[1:])
-    else:
-        return TestSpec(name=name, only=constraints)
+    test_fields = fields(TestSpec)
+    field_names = [f.name for f in test_fields]
+    known: Dict = dict()
+    other: Dict = dict()
+    if line[0] == "(":
+        # have key/value pairs to parse
+        start, end = find_matching(line)
+        if end < start:
+            raise ParseError(f"Mismatched parentheses in TESTCASE: {line}")
+
+        keyvalues_str = line[start:end + 1]
+        keyvalues_pattern = re.compile(r'''
+            (?:\(\s*|\s*,\s*)   # start with open parentheses or comma, no capture
+            ([A-Za-z]+[\w\-]+)  # match key starting with alpha, containing alphanumeric, _, or -; captured as Group 1
+            \s*=\s*             # key is followed by = (whitespace ignored)
+            (?:                 # uncaptured group for OR
+              "((?:[^"]|\\")+)" #   match quoted value (any internal " must be escaped as \"); captured as Group 2
+            | ([^=]+)           #   OR match unquoted value (no equals signs allowed); captured as Group 3
+            )                   # end uncaptured group for OR
+            \s*(?=,|\))         # lookahead for either next comma or closing parentheses
+        ''', re.VERBOSE)
+
+        for match in re.finditer(keyvalues_pattern, keyvalues_str):
+            if not match:  # empty
+                continue
+            key = match.group(1)
+            value = match.group(2) if match.group(2) else match.group(3)
+            try:
+                index = field_names.index(key)
+                if key == "only":  # weird bc only is a list
+                    value = [constraint.strip() for constraint in value.split(',')]
+                try:
+                    # TODO: stop supporting python <=3.8
+                    known[key] = test_fields[index].type(value)  # type: ignore
+                except TypeError:
+                    # TODO: this is still liable to fail for complex types
+                    known[key] = get_origin(test_fields[index].type)(value)  # type: ignore
+            except ValueError:
+                other[key] = value
+
+        line = line[end + 1:]
+
+    if not 'name' in known.keys():
+        known['name'] = fallback_name
+
+    args_pattern = re.compile(r'''
+        \s+(            # remove leading space
+            (?:"[^"]+") # match quoted CLI option
+          | (?:[\S]+)   # match anything else that is space separated
+        )
+    ''', re.VERBOSE)
+    args: List[str] = re.findall(args_pattern, line)
+    for k, v in other.items():
+        print(f"warning, unknown TESTCASE option for test '{known['name']}': {k}={v}")
+    return TestSpec(**known, key_values=other, args=args)
 
 
 def get_test_args(source_file: Path) -> List[TestSpec]:
@@ -257,19 +396,22 @@ def get_test_args(source_file: Path) -> List[TestSpec]:
     else:
         raise RuntimeError(f'Unrecognized extension for file: {source_file}')
 
-    return [parse_test_line(line.strip(comment_str))
+    return [parse_test_line(line.strip(comment_str).removeprefix("TESTARGS"), source_file.stem)
             for line in source_file.read_text().splitlines()
-            if line.startswith(f'{comment_str}TESTARGS')] or [TestSpec('', args=['{ceed_resource}'])]
+            if line.startswith(f'{comment_str}TESTARGS')] or [TestSpec(source_file.stem, args=['{ceed_resource}'])]
 
 
-def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: float = 1e-2) -> str:
+def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float, rel_tol: float,
+             comment_str: str = '#', comment_func: Optional[Callable[[str, str], Optional[str]]] = None) -> str:
     """Compare CSV results against an expected CSV file with tolerances
 
     Args:
         test_csv (Path): Path to output CSV results
         true_csv (Path): Path to expected CSV results
-        zero_tol (float, optional): Tolerance below which values are considered to be zero. Defaults to 3e-10.
-        rel_tol (float, optional): Relative tolerance for comparing non-zero values. Defaults to 1e-2.
+        zero_tol (float): Tolerance below which values are considered to be zero.
+        rel_tol (float): Relative tolerance for comparing non-zero values.
+        comment_str (str, optional): String to denoting commented line
+        comment_func (Callable, optional): Function to determine if test and true line are different
 
     Returns:
         str: Diff output between result and expected CSVs
@@ -281,15 +423,38 @@ def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: f
         return f'No lines found in test output {test_csv}'
     if len(true_lines) == 0:
         return f'No lines found in test source {true_csv}'
+    if len(test_lines) != len(true_lines):
+        return f'Number of lines in {test_csv} and {true_csv} do not match'
+
+    # Process commented lines
+    uncommented_lines: List[int] = []
+    for n, (test_line, true_line) in enumerate(zip(test_lines, true_lines)):
+        if test_line[0] == comment_str and true_line[0] == comment_str:
+            if comment_func:
+                output = comment_func(test_line, true_line)
+                if output:
+                    return output
+        elif test_line[0] == comment_str and true_line[0] != comment_str:
+            return f'Commented line found in {test_csv} at line {n} but not in {true_csv}'
+        elif test_line[0] != comment_str and true_line[0] == comment_str:
+            return f'Commented line found in {true_csv} at line {n} but not in {test_csv}'
+        else:
+            uncommented_lines.append(n)
+
+    # Remove commented lines
+    test_lines = [test_lines[line] for line in uncommented_lines]
+    true_lines = [true_lines[line] for line in uncommented_lines]
 
     test_reader: csv.DictReader = csv.DictReader(test_lines)
     true_reader: csv.DictReader = csv.DictReader(true_lines)
+    if not test_reader.fieldnames:
+        return f'No CSV columns found in test output {test_csv}'
+    if not true_reader.fieldnames:
+        return f'No CSV columns found in test source {true_csv}'
     if test_reader.fieldnames != true_reader.fieldnames:
         return ''.join(difflib.unified_diff([f'{test_lines[0]}\n'], [f'{true_lines[0]}\n'],
                        tofile='found CSV columns', fromfile='expected CSV columns'))
 
-    if len(test_lines) != len(true_lines):
-        return f'Number of lines in {test_csv} and {true_csv} do not match'
     diff_lines: List[str] = list()
     for test_line, true_line in zip(test_reader, true_reader):
         for key in test_reader.fieldnames:
@@ -313,13 +478,13 @@ def diff_csv(test_csv: Path, true_csv: Path, zero_tol: float = 3e-10, rel_tol: f
     return '\n'.join(diff_lines)
 
 
-def diff_cgns(test_cgns: Path, true_cgns: Path, cgns_tol: float = 1e-12) -> str:
+def diff_cgns(test_cgns: Path, true_cgns: Path, cgns_tol: float) -> str:
     """Compare CGNS results against an expected CGSN file with tolerance
 
     Args:
         test_cgns (Path): Path to output CGNS file
         true_cgns (Path): Path to expected CGNS file
-        cgns_tol (float, optional): Tolerance for comparing floating-point values
+        cgns_tol (float): Tolerance for comparing floating-point values
 
     Returns:
         str: Diff output between result and expected CGNS files
@@ -333,35 +498,63 @@ def diff_cgns(test_cgns: Path, true_cgns: Path, cgns_tol: float = 1e-12) -> str:
                           stderr=subprocess.PIPE,
                           env=my_env)
 
-    return proc.stderr.decode('utf-8') + proc.stdout.decode('utf-8')
+    return proc.stderr.decode('utf-8', errors='replace') + proc.stdout.decode('utf-8', errors='replace')
+
+
+def diff_ascii(test_file: Path, true_file: Path, backend: str) -> str:
+    """Compare ASCII results against an expected ASCII file
+
+    Args:
+        test_file (Path): Path to output ASCII file
+        true_file (Path): Path to expected ASCII file
+
+    Returns:
+        str: Diff output between result and expected ASCII files
+    """
+    tmp_backend: str = backend.replace('/', '-')
+    true_str: str = true_file.read_text().replace('{ceed_resource}', tmp_backend)
+    diff = list(difflib.unified_diff(test_file.read_text().splitlines(keepends=True),
+                                     true_str.splitlines(keepends=True),
+                                     fromfile=str(test_file),
+                                     tofile=str(true_file)))
+    return ''.join(diff)
 
 
 def test_case_output_string(test_case: TestCase, spec: TestSpec, mode: RunMode,
-                            backend: str, test: str, index: int) -> str:
+                            backend: str, test: str, index: int, verbose: bool) -> str:
     output_str = ''
     if mode is RunMode.TAP:
         # print incremental output if TAP mode
         if test_case.is_skipped():
             output_str += f'    ok {index} - {spec.name}, {backend} # SKIP {test_case.skipped[0]["message"]}\n'
         elif test_case.is_failure() or test_case.is_error():
-            output_str += f'    not ok {index} - {spec.name}, {backend}\n'
+            output_str += f'    not ok {index} - {spec.name}, {backend} ({test_case.elapsed_sec} s)\n'
         else:
-            output_str += f'    ok {index} - {spec.name}, {backend}\n'
-        output_str += f'      ---\n'
-        if spec.only:
-            output_str += f'      only: {",".join(spec.only)}\n'
-        output_str += f'      args: {test_case.args}\n'
-        if test_case.is_error():
-            output_str += f'      error: {test_case.errors[0]["message"]}\n'
-        if test_case.is_failure():
-            output_str += f'      num_failures: {len(test_case.failures)}\n'
-            for i, failure in enumerate(test_case.failures):
-                output_str += f'      failure_{i}: {failure["message"]}\n'
-                output_str += f'        message: {failure["message"]}\n'
-                if failure["output"]:
-                    out = failure["output"].strip().replace('\n', '\n          ')
-                    output_str += f'        output: |\n          {out}\n'
-        output_str += f'      ...\n'
+            output_str += f'    ok {index} - {spec.name}, {backend} ({test_case.elapsed_sec} s)\n'
+        if test_case.is_failure() or test_case.is_error() or verbose:
+            output_str += f'      ---\n'
+            if spec.only:
+                output_str += f'      only: {",".join(spec.only)}\n'
+            output_str += f'      args: {test_case.args}\n'
+            if spec.csv_ztol > 0:
+                output_str += f'      csv_ztol: {spec.csv_ztol}\n'
+            if spec.csv_rtol > 0:
+                output_str += f'      csv_rtol: {spec.csv_rtol}\n'
+            if spec.cgns_tol > 0:
+                output_str += f'      cgns_tol: {spec.cgns_tol}\n'
+            for k, v in spec.key_values.items():
+                output_str += f'      {k}: {v}\n'
+            if test_case.is_error():
+                output_str += f'      error: {test_case.errors[0]["message"]}\n'
+            if test_case.is_failure():
+                output_str += f'      failures:\n'
+                for i, failure in enumerate(test_case.failures):
+                    output_str += f'        -\n'
+                    output_str += f'          message: {failure["message"]}\n'
+                    if failure["output"]:
+                        out = failure["output"].strip().replace('\n', '\n            ')
+                        output_str += f'          output: |\n            {out}\n'
+            output_str += f'      ...\n'
     else:
         # print error or failure information if JUNIT mode
         if test_case.is_error() or test_case.is_failure():
@@ -377,8 +570,20 @@ def test_case_output_string(test_case: TestCase, spec: TestSpec, mode: RunMode,
     return output_str
 
 
+def save_failure_artifact(suite_spec: SuiteSpec, file: Path) -> Path:
+    """Attach a file to a test case
+
+    Args:
+        test_case (TestCase): Test case to attach the file to
+        file (Path): Path to the file to attach
+    """
+    save_path: Path = suite_spec.test_failure_artifacts_path / file.name
+    shutil.copyfile(file, save_path)
+    return save_path
+
+
 def run_test(index: int, test: str, spec: TestSpec, backend: str,
-             mode: RunMode, nproc: int, suite_spec: SuiteSpec) -> TestCase:
+             mode: RunMode, nproc: int, suite_spec: SuiteSpec, verbose: bool = False) -> TestCase:
     """Run a single test case and backend combination
 
     Args:
@@ -389,6 +594,7 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
         mode (RunMode): Output mode
         nproc (int): Number of MPI processes to use when running test case
         suite_spec (SuiteSpec): Specification of test suite
+        verbose (bool, optional): Print detailed output for all runs, not just failures. Defaults to False.
 
     Returns:
         TestCase: Test case result
@@ -407,7 +613,7 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
         run_args = ['mpiexec', '-n', f'{nproc}', *run_args]
 
     # run test
-    skip_reason: str = suite_spec.check_pre_skip(test, spec, backend, nproc)
+    skip_reason: Optional[str] = suite_spec.check_pre_skip(test, spec, backend, nproc)
     if skip_reason:
         test_case: TestCase = TestCase(f'{test}, "{spec.name}", n{nproc}, {backend}',
                                        elapsed_sec=0,
@@ -428,24 +634,28 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
                              classname=source_path.parent,
                              elapsed_sec=time.time() - start,
                              timestamp=time.strftime('%Y-%m-%d %H:%M:%S %Z', time.localtime(start)),
-                             stdout=proc.stdout.decode('utf-8'),
-                             stderr=proc.stderr.decode('utf-8'),
+                             stdout=proc.stdout.decode('utf-8', errors='replace'),
+                             stderr=proc.stderr.decode('utf-8', errors='replace'),
                              allow_multiple_subelements=True,
                              category=spec.name,)
         ref_csvs: List[Path] = []
-        output_files: List[str] = [arg for arg in run_args if 'ascii:' in arg]
+        ref_ascii: List[Path] = []
+        output_files: List[str] = [arg.split(':')[1] for arg in run_args if arg.startswith('ascii:')]
         if output_files:
-            ref_csvs = [suite_spec.get_output_path(test, file.split('ascii:')[-1]) for file in output_files]
+            ref_csvs = [suite_spec.get_output_path(test, file)
+                        for file in output_files if file.endswith('.csv')]
+            ref_ascii = [suite_spec.get_output_path(test, file)
+                         for file in output_files if not file.endswith('.csv')]
         ref_cgns: List[Path] = []
-        output_files = [arg for arg in run_args if 'cgns:' in arg]
+        output_files = [arg.split(':')[1] for arg in run_args if arg.startswith('cgns:')]
         if output_files:
-            ref_cgns = [suite_spec.get_output_path(test, file.split('cgns:')[-1]) for file in output_files]
+            ref_cgns = [suite_spec.get_output_path(test, file) for file in output_files]
         ref_stdout: Path = suite_spec.get_output_path(test, test + '.out')
-        suite_spec.post_test_hook(test, spec)
+        suite_spec.post_test_hook(test, spec, backend)
 
     # check allowed failures
     if not test_case.is_skipped() and test_case.stderr:
-        skip_reason: str = suite_spec.check_post_skip(test, spec, backend, test_case.stderr)
+        skip_reason: Optional[str] = suite_spec.check_post_skip(test, spec, backend, test_case.stderr)
         if skip_reason:
             test_case.add_skipped_info(skip_reason)
 
@@ -460,7 +670,12 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
 
     # classify other results
     if not test_case.is_skipped() and not test_case.status:
-        if test_case.stderr:
+        # Filter out chipStar (CHIP) runtime informational/warning lines which are not errors
+        filtered_stderr = '\n'.join(
+            line for line in test_case.stderr.split('\n')
+            if not line.startswith(('CHIP info ', 'CHIP warning ', 'CHIP debug '))
+        ).strip()
+        if filtered_stderr:
             test_case.add_failure_info('stderr', test_case.stderr)
         if proc.returncode != 0:
             test_case.add_error_info(f'returncode = {proc.returncode}')
@@ -476,35 +691,73 @@ def run_test(index: int, test: str, spec: TestSpec, backend: str,
         # expected CSV output
         for ref_csv in ref_csvs:
             csv_name = ref_csv.name
+            out_file = Path.cwd() / csv_name
             if not ref_csv.is_file():
                 # remove _{ceed_backend} from path name
                 ref_csv = (ref_csv.parent / ref_csv.name.rsplit('_', 1)[0]).with_suffix('.csv')
             if not ref_csv.is_file():
                 test_case.add_failure_info('csv', output=f'{ref_csv} not found')
+            elif not out_file.is_file():
+                test_case.add_failure_info('csv', output=f'{out_file} not found')
             else:
-                diff: str = diff_csv(Path.cwd() / csv_name, ref_csv)
+                csv_ztol: float = spec.csv_ztol if spec.csv_ztol > 0 else suite_spec.csv_ztol
+                csv_rtol: float = spec.csv_rtol if spec.csv_rtol > 0 else suite_spec.csv_rtol
+                diff = diff_csv(
+                    out_file,
+                    ref_csv,
+                    csv_ztol,
+                    csv_rtol,
+                    suite_spec.csv_comment_str,
+                    suite_spec.csv_comment_diff_fn)
                 if diff:
-                    test_case.add_failure_info('csv', output=diff)
+                    save_path: Path = suite_spec.test_failure_artifacts_path / csv_name
+                    shutil.move(out_file, save_path)
+                    test_case.add_failure_info(f'csv: {save_path}', output=diff)
                 else:
-                    (Path.cwd() / csv_name).unlink()
+                    out_file.unlink()
         # expected CGNS output
         for ref_cgn in ref_cgns:
             cgn_name = ref_cgn.name
+            out_file = Path.cwd() / cgn_name
             if not ref_cgn.is_file():
                 # remove _{ceed_backend} from path name
                 ref_cgn = (ref_cgn.parent / ref_cgn.name.rsplit('_', 1)[0]).with_suffix('.cgns')
             if not ref_cgn.is_file():
                 test_case.add_failure_info('cgns', output=f'{ref_cgn} not found')
+            elif not out_file.is_file():
+                test_case.add_failure_info('cgns', output=f'{out_file} not found')
+            else:
+                cgns_tol = spec.cgns_tol if spec.cgns_tol > 0 else suite_spec.cgns_tol
+                diff = diff_cgns(out_file, ref_cgn, cgns_tol=cgns_tol)
+                if diff:
+                    save_path: Path = suite_spec.test_failure_artifacts_path / cgn_name
+                    shutil.move(out_file, save_path)
+                    test_case.add_failure_info(f'cgns: {save_path}', output=diff)
+                else:
+                    out_file.unlink()
+        # expected ASCII output
+        for ref_file in ref_ascii:
+            ref_name = ref_file.name
+            out_file = Path.cwd() / ref_name
+            if not ref_file.is_file():
+                # remove _{ceed_backend} from path name
+                ref_file = (ref_file.parent / ref_file.name.rsplit('_', 1)[0]).with_suffix(ref_file.suffix)
+            if not ref_file.is_file():
+                test_case.add_failure_info('ascii', output=f'{ref_file} not found')
+            elif not out_file.is_file():
+                test_case.add_failure_info('ascii', output=f'{out_file} not found')
             else:
-                diff = diff_cgns(Path.cwd() / cgn_name, ref_cgn, cgns_tol=suite_spec.cgns_tol)
+                diff = diff_ascii(out_file, ref_file, backend)
                 if diff:
-                    test_case.add_failure_info('cgns', output=diff)
+                    save_path: Path = suite_spec.test_failure_artifacts_path / ref_name
+                    shutil.move(out_file, save_path)
+                    test_case.add_failure_info(f'ascii: {save_path}', output=diff)
                 else:
-                    (Path.cwd() / cgn_name).unlink()
+                    out_file.unlink()
 
     # store result
     test_case.args = ' '.join(str(arg) for arg in run_args)
-    output_str = test_case_output_string(test_case, spec, mode, backend, test, index)
+    output_str = test_case_output_string(test_case, spec, mode, backend, test, index, verbose)
 
     return test_case, output_str
 
@@ -518,7 +771,7 @@ def init_process():
 
 
 def run_tests(test: str, ceed_backends: List[str], mode: RunMode, nproc: int,
-              suite_spec: SuiteSpec, pool_size: int = 1) -> TestSuite:
+              suite_spec: SuiteSpec, pool_size: int = 1, search: str = ".*", verbose: bool = False) -> TestSuite:
     """Run all test cases for `test` with each of the provided `ceed_backends`
 
     Args:
@@ -528,18 +781,23 @@ def run_tests(test: str, ceed_backends: List[str], mode: RunMode, nproc: int,
         nproc (int): Number of MPI processes to use when running each test case
         suite_spec (SuiteSpec): Object defining required methods for running tests
         pool_size (int, optional): Number of processes to use when running tests in parallel. Defaults to 1.
+        search (str, optional): Regular expression used to match tests. Defaults to ".*".
+        verbose (bool, optional): Print detailed output for all runs, not just failures. Defaults to False.
 
     Returns:
         TestSuite: JUnit `TestSuite` containing results of all test cases
     """
-    test_specs: List[TestSpec] = get_test_args(suite_spec.get_source_path(test))
+    test_specs: List[TestSpec] = [
+        t for t in get_test_args(suite_spec.get_source_path(test)) if re.search(search, t.name, re.IGNORECASE)
+    ]
+    suite_spec.test_failure_artifacts_path.mkdir(parents=True, exist_ok=True)
     if mode is RunMode.TAP:
         print('TAP version 13')
         print(f'1..{len(test_specs)}')
 
     with mp.Pool(processes=pool_size, initializer=init_process) as pool:
-        async_outputs: List[List[mp.AsyncResult]] = [
-            [pool.apply_async(run_test, (i, test, spec, backend, mode, nproc, suite_spec))
+        async_outputs: List[List[mp.pool.AsyncResult]] = [
+            [pool.apply_async(run_test, (i, test, spec, backend, mode, nproc, suite_spec, verbose))
              for (i, backend) in enumerate(ceed_backends, start=1)]
             for spec in test_specs
         ]
@@ -564,15 +822,14 @@ def run_tests(test: str, ceed_backends: List[str], mode: RunMode, nproc: int,
     return TestSuite(test, test_cases)
 
 
-def write_junit_xml(test_suite: TestSuite, output_file: Optional[Path], batch: str = '') -> None:
+def write_junit_xml(test_suite: TestSuite, batch: str = '') -> None:
     """Write a JUnit XML file containing the results of a `TestSuite`
 
     Args:
         test_suite (TestSuite): JUnit `TestSuite` to write
-        output_file (Optional[Path]): Path to output file, or `None` to generate automatically as `build/{test_suite.name}{batch}.junit`
         batch (str): Name of JUnit batch, defaults to empty string
     """
-    output_file: Path = output_file or Path('build') / (f'{test_suite.name}{batch}.junit')
+    output_file = Path('build') / (f'{test_suite.name}{batch}.junit')
     output_file.write_text(to_xml_report_string([test_suite]))
 
 
diff --git a/tests/output/t107-vector-f.out b/tests/output/t107-vector-f.out
index c4823d39c7..f3faa3e8ea 100644
--- a/tests/output/t107-vector-f.out
+++ b/tests/output/t107-vector-f.out
@@ -9,3 +9,14 @@ CeedVector length 10
    17.00000000
    18.00000000
    19.00000000
+  CeedVector length 10
+     10.00000000
+     11.00000000
+     12.00000000
+     13.00000000
+     14.00000000
+     15.00000000
+     16.00000000
+     17.00000000
+     18.00000000
+     19.00000000
diff --git a/tests/output/t107-vector.out b/tests/output/t107-vector.out
index c4823d39c7..f3faa3e8ea 100644
--- a/tests/output/t107-vector.out
+++ b/tests/output/t107-vector.out
@@ -9,3 +9,14 @@ CeedVector length 10
    17.00000000
    18.00000000
    19.00000000
+  CeedVector length 10
+     10.00000000
+     11.00000000
+     12.00000000
+     13.00000000
+     14.00000000
+     15.00000000
+     16.00000000
+     17.00000000
+     18.00000000
+     19.00000000
diff --git a/tests/output/t210-elemrestriction-f.out b/tests/output/t210-elemrestriction-f.out
index 0696c8ce32..22990a413e 100644
--- a/tests/output/t210-elemrestriction-f.out
+++ b/tests/output/t210-elemrestriction-f.out
@@ -1 +1,2 @@
 CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1
+  CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1
diff --git a/tests/output/t210-elemrestriction.out b/tests/output/t210-elemrestriction.out
index 0696c8ce32..22990a413e 100644
--- a/tests/output/t210-elemrestriction.out
+++ b/tests/output/t210-elemrestriction.out
@@ -1 +1,2 @@
 CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1
+  CeedElemRestriction from (4, 1) to 3 elements with 2 nodes each and component stride 1
diff --git a/tests/output/t211-elemrestriction-f.out b/tests/output/t211-elemrestriction-f.out
index af26a4a612..b2d7a029c4 100644
--- a/tests/output/t211-elemrestriction-f.out
+++ b/tests/output/t211-elemrestriction-f.out
@@ -1 +1,2 @@
 CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
+  CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
diff --git a/tests/output/t211-elemrestriction.out b/tests/output/t211-elemrestriction.out
index af26a4a612..b2d7a029c4 100644
--- a/tests/output/t211-elemrestriction.out
+++ b/tests/output/t211-elemrestriction.out
@@ -1 +1,2 @@
 CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
+  CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
diff --git a/tests/output/t212-elemrestriction-f.out b/tests/output/t212-elemrestriction-f.out
index a5cd6de40b..7d72d8c00f 100644
--- a/tests/output/t212-elemrestriction-f.out
+++ b/tests/output/t212-elemrestriction-f.out
@@ -1 +1,2 @@
 Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
+  Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
diff --git a/tests/output/t212-elemrestriction.out b/tests/output/t212-elemrestriction.out
index a5cd6de40b..7d72d8c00f 100644
--- a/tests/output/t212-elemrestriction.out
+++ b/tests/output/t212-elemrestriction.out
@@ -1 +1,2 @@
 Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
+  Blocked CeedElemRestriction from (6, 1) to 3 elements with 2 nodes each and strides [1, 2, 2]
diff --git a/tests/output/t300-basis-f.out b/tests/output/t300-basis-f.out
index ebbe0f9635..5ab53a4686 100644
--- a/tests/output/t300-basis-f.out
+++ b/tests/output/t300-basis-f.out
@@ -32,3 +32,20 @@ CeedBasis in a H^1 space on a line element
     [1]	 -0.51670214	 -0.48795249	  1.33790510	 -0.33325047
     [2]	  0.33325047	 -1.33790510	  0.48795249	  0.51670214
     [3]	 -0.18899664	  0.63510411	 -2.78794489	  2.34183742
+  CeedBasis in a H^1 space on a line element
+    P: 4
+    Q: 4
+    dimension: 1
+    field components: 1
+    qref1d:   	 -0.86113631	 -0.33998104	  0.33998104	  0.86113631
+    qweight1d:	  0.34785485	  0.65214515	  0.65214515	  0.34785485
+    interp1d:
+      [0]	  0.62994317	  0.47255875	 -0.14950343	  0.04700152
+      [1]	 -0.07069480	  0.97297619	  0.13253993	 -0.03482132
+      [2]	 -0.03482132	  0.13253993	  0.97297619	 -0.07069480
+      [3]	  0.04700152	 -0.14950343	  0.47255875	  0.62994317
+    grad1d:
+      [0]	 -2.34183742	  2.78794489	 -0.63510411	  0.18899664
+      [1]	 -0.51670214	 -0.48795249	  1.33790510	 -0.33325047
+      [2]	  0.33325047	 -1.33790510	  0.48795249	  0.51670214
+      [3]	 -0.18899664	  0.63510411	 -2.78794489	  2.34183742
diff --git a/tests/output/t300-basis.out b/tests/output/t300-basis.out
index ebbe0f9635..5ab53a4686 100644
--- a/tests/output/t300-basis.out
+++ b/tests/output/t300-basis.out
@@ -32,3 +32,20 @@ CeedBasis in a H^1 space on a line element
     [1]	 -0.51670214	 -0.48795249	  1.33790510	 -0.33325047
     [2]	  0.33325047	 -1.33790510	  0.48795249	  0.51670214
     [3]	 -0.18899664	  0.63510411	 -2.78794489	  2.34183742
+  CeedBasis in a H^1 space on a line element
+    P: 4
+    Q: 4
+    dimension: 1
+    field components: 1
+    qref1d:   	 -0.86113631	 -0.33998104	  0.33998104	  0.86113631
+    qweight1d:	  0.34785485	  0.65214515	  0.65214515	  0.34785485
+    interp1d:
+      [0]	  0.62994317	  0.47255875	 -0.14950343	  0.04700152
+      [1]	 -0.07069480	  0.97297619	  0.13253993	 -0.03482132
+      [2]	 -0.03482132	  0.13253993	  0.97297619	 -0.07069480
+      [3]	  0.04700152	 -0.14950343	  0.47255875	  0.62994317
+    grad1d:
+      [0]	 -2.34183742	  2.78794489	 -0.63510411	  0.18899664
+      [1]	 -0.51670214	 -0.48795249	  1.33790510	 -0.33325047
+      [2]	  0.33325047	 -1.33790510	  0.48795249	  0.51670214
+      [3]	 -0.18899664	  0.63510411	 -2.78794489	  2.34183742
diff --git a/tests/output/t320-basis-f.out b/tests/output/t320-basis-f.out
index a1522dd848..34c78eeaaf 100644
--- a/tests/output/t320-basis-f.out
+++ b/tests/output/t320-basis-f.out
@@ -19,3 +19,24 @@ CeedBasis in a H^1 space on a triangle element
     [5]	  0.20000000	 -2.40000000	  0.00000000	  0.00000000	  2.40000000	 -0.20000000
     [6]	 -0.33333333	 -1.33333333	  0.00000000	  0.00000000	  1.33333333	  0.33333333
     [7]	  0.20000000	 -0.80000000	  0.00000000	 -1.60000000	  0.80000000	  1.40000000
+  CeedBasis in a H^1 space on a triangle element
+    P: 6
+    Q: 4
+    dimension: 2
+    field components: 1
+    qref:     	  0.20000000	  0.60000000	  0.33333333	  0.20000000	  0.20000000	  0.20000000	  0.33333333	  0.60000000
+    qweight:  	  0.26041667	  0.26041667	 -0.28125000	  0.26041667
+    interp:
+      [0]	  0.12000000	  0.48000000	 -0.12000000	  0.48000000	  0.16000000	 -0.12000000
+      [1]	 -0.12000000	  0.48000000	  0.12000000	  0.16000000	  0.48000000	 -0.12000000
+      [2]	 -0.11111111	  0.44444444	 -0.11111111	  0.44444444	  0.44444444	 -0.11111111
+      [3]	 -0.12000000	  0.16000000	 -0.12000000	  0.48000000	  0.48000000	  0.12000000
+    grad:
+      [0]	 -1.40000000	  1.60000000	 -0.20000000	 -0.80000000	  0.80000000	  0.00000000
+      [1]	  0.20000000	 -1.60000000	  1.40000000	 -0.80000000	  0.80000000	  0.00000000
+      [2]	 -0.33333333	  0.00000000	  0.33333333	 -1.33333333	  1.33333333	  0.00000000
+      [3]	  0.20000000	  0.00000000	 -0.20000000	 -2.40000000	  2.40000000	  0.00000000
+      [4]	 -1.40000000	 -0.80000000	  0.00000000	  1.60000000	  0.80000000	 -0.20000000
+      [5]	  0.20000000	 -2.40000000	  0.00000000	  0.00000000	  2.40000000	 -0.20000000
+      [6]	 -0.33333333	 -1.33333333	  0.00000000	  0.00000000	  1.33333333	  0.33333333
+      [7]	  0.20000000	 -0.80000000	  0.00000000	 -1.60000000	  0.80000000	  1.40000000
diff --git a/tests/output/t320-basis.out b/tests/output/t320-basis.out
index a1522dd848..34c78eeaaf 100644
--- a/tests/output/t320-basis.out
+++ b/tests/output/t320-basis.out
@@ -19,3 +19,24 @@ CeedBasis in a H^1 space on a triangle element
     [5]	  0.20000000	 -2.40000000	  0.00000000	  0.00000000	  2.40000000	 -0.20000000
     [6]	 -0.33333333	 -1.33333333	  0.00000000	  0.00000000	  1.33333333	  0.33333333
     [7]	  0.20000000	 -0.80000000	  0.00000000	 -1.60000000	  0.80000000	  1.40000000
+  CeedBasis in a H^1 space on a triangle element
+    P: 6
+    Q: 4
+    dimension: 2
+    field components: 1
+    qref:     	  0.20000000	  0.60000000	  0.33333333	  0.20000000	  0.20000000	  0.20000000	  0.33333333	  0.60000000
+    qweight:  	  0.26041667	  0.26041667	 -0.28125000	  0.26041667
+    interp:
+      [0]	  0.12000000	  0.48000000	 -0.12000000	  0.48000000	  0.16000000	 -0.12000000
+      [1]	 -0.12000000	  0.48000000	  0.12000000	  0.16000000	  0.48000000	 -0.12000000
+      [2]	 -0.11111111	  0.44444444	 -0.11111111	  0.44444444	  0.44444444	 -0.11111111
+      [3]	 -0.12000000	  0.16000000	 -0.12000000	  0.48000000	  0.48000000	  0.12000000
+    grad:
+      [0]	 -1.40000000	  1.60000000	 -0.20000000	 -0.80000000	  0.80000000	  0.00000000
+      [1]	  0.20000000	 -1.60000000	  1.40000000	 -0.80000000	  0.80000000	  0.00000000
+      [2]	 -0.33333333	  0.00000000	  0.33333333	 -1.33333333	  1.33333333	  0.00000000
+      [3]	  0.20000000	  0.00000000	 -0.20000000	 -2.40000000	  2.40000000	  0.00000000
+      [4]	 -1.40000000	 -0.80000000	  0.00000000	  1.60000000	  0.80000000	 -0.20000000
+      [5]	  0.20000000	 -2.40000000	  0.00000000	  0.00000000	  2.40000000	 -0.20000000
+      [6]	 -0.33333333	 -1.33333333	  0.00000000	  0.00000000	  1.33333333	  0.33333333
+      [7]	  0.20000000	 -0.80000000	  0.00000000	 -1.60000000	  0.80000000	  1.40000000
diff --git a/tests/output/t330-basis.out b/tests/output/t330-basis.out
index 75e93004fc..1377df2bb5 100644
--- a/tests/output/t330-basis.out
+++ b/tests/output/t330-basis.out
@@ -34,3 +34,39 @@ CeedBasis in a H(div) space on a quadrilateral element
     [6]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
     [7]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
     [8]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+  CeedBasis in a H(div) space on a quadrilateral element
+    P: 8
+    Q: 9
+    dimension: 2
+    field components: 1
+    qref:     	 -0.77459667	  0.00000000	  0.77459667	 -0.77459667	  0.00000000	  0.77459667	 -0.77459667	  0.00000000	  0.77459667	 -0.77459667	 -0.77459667	 -0.77459667	  0.00000000	  0.00000000	  0.00000000	  0.77459667	  0.77459667	  0.77459667
+    qweight:  	  0.30864198	  0.49382716	  0.30864198	  0.49382716	  0.79012346	  0.49382716	  0.30864198	  0.49382716	  0.30864198
+    interp:
+      [0]	 -0.05000000	  0.05000000	  0.10000000	  0.01270167	 -0.05000000	  0.05000000	 -0.78729833	 -0.10000000
+      [1]	 -0.12500000	  0.12500000	  0.44364917	  0.05635083	 -0.12500000	  0.12500000	 -0.44364917	 -0.05635083
+      [2]	 -0.05000000	  0.05000000	  0.78729833	  0.10000000	 -0.05000000	  0.05000000	 -0.10000000	 -0.01270167
+      [3]	 -0.05000000	  0.05000000	  0.05635083	  0.05635083	 -0.05000000	  0.05000000	 -0.44364917	 -0.44364917
+      [4]	 -0.12500000	  0.12500000	  0.25000000	  0.25000000	 -0.12500000	  0.12500000	 -0.25000000	 -0.25000000
+      [5]	 -0.05000000	  0.05000000	  0.44364917	  0.44364917	 -0.05000000	  0.05000000	 -0.05635083	 -0.05635083
+      [6]	 -0.05000000	  0.05000000	  0.01270167	  0.10000000	 -0.05000000	  0.05000000	 -0.10000000	 -0.78729833
+      [7]	 -0.12500000	  0.12500000	  0.05635083	  0.44364917	 -0.12500000	  0.12500000	 -0.05635083	 -0.44364917
+      [8]	 -0.05000000	  0.05000000	  0.10000000	  0.78729833	 -0.05000000	  0.05000000	 -0.01270167	 -0.10000000
+      [9]	 -0.78729833	 -0.10000000	 -0.05000000	  0.05000000	  0.10000000	  0.01270167	 -0.05000000	  0.05000000
+      [10]	 -0.44364917	 -0.44364917	 -0.05000000	  0.05000000	  0.05635083	  0.05635083	 -0.05000000	  0.05000000
+      [11]	 -0.10000000	 -0.78729833	 -0.05000000	  0.05000000	  0.01270167	  0.10000000	 -0.05000000	  0.05000000
+      [12]	 -0.44364917	 -0.05635083	 -0.12500000	  0.12500000	  0.44364917	  0.05635083	 -0.12500000	  0.12500000
+      [13]	 -0.25000000	 -0.25000000	 -0.12500000	  0.12500000	  0.25000000	  0.25000000	 -0.12500000	  0.12500000
+      [14]	 -0.05635083	 -0.44364917	 -0.12500000	  0.12500000	  0.05635083	  0.44364917	 -0.12500000	  0.12500000
+      [15]	 -0.10000000	 -0.01270167	 -0.05000000	  0.05000000	  0.78729833	  0.10000000	 -0.05000000	  0.05000000
+      [16]	 -0.05635083	 -0.05635083	 -0.05000000	  0.05000000	  0.44364917	  0.44364917	 -0.05000000	  0.05000000
+      [17]	 -0.01270167	 -0.10000000	 -0.05000000	  0.05000000	  0.10000000	  0.78729833	 -0.05000000	  0.05000000
+    div:
+      [0]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [1]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [2]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [3]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [4]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [5]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [6]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [7]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
+      [8]	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000	  0.25000000
diff --git a/tests/output/t340-basis.out b/tests/output/t340-basis.out
index 5c97bec2a8..fc3b0b9123 100644
--- a/tests/output/t340-basis.out
+++ b/tests/output/t340-basis.out
@@ -19,3 +19,24 @@ CeedBasis in a H(curl) space on a triangle element
     [1]	 -1.20000000	 -1.20000000	 -8.40000000	  1.20000000	  8.40000000	 -1.20000000	 -9.60000000	  9.60000000
     [2]	 -1.20000000	  8.40000000	  1.20000000	 -8.40000000	 -1.20000000	 -1.20000000	  9.60000000	  9.60000000
     [3]	  8.40000000	 -1.20000000	  1.20000000	  1.20000000	 -1.20000000	  8.40000000	  0.00000000	-19.20000000
+  CeedBasis in a H(curl) space on a triangle element
+    P: 8
+    Q: 4
+    dimension: 2
+    field components: 1
+    qref:     	  0.33333333	  0.20000000	  0.20000000	  0.60000000	  0.33333333	  0.20000000	  0.60000000	  0.20000000
+    qweight:  	 -0.26041667	  0.26041667	  0.28125000	  0.26041667
+    interp:
+      [0]	 -0.22222222	  0.44444444	  0.22222222	 -0.44444444	 -0.22222222	 -0.22222222	  2.66666667	  0.00000000
+      [1]	  0.08000000	  0.48000000	  0.56000000	 -0.48000000	  1.04000000	 -0.72000000	  2.24000000	 -0.64000000
+      [2]	  0.24000000	 -0.48000000	 -0.24000000	  0.48000000	 -0.56000000	 -0.56000000	  2.88000000	  0.00000000
+      [3]	 -0.56000000	  0.48000000	 -0.08000000	 -0.48000000	 -0.72000000	  1.04000000	  1.60000000	  0.64000000
+      [4]	 -0.44444444	  0.22222222	 -0.22222222	 -0.22222222	  0.22222222	 -0.44444444	  0.00000000	  2.66666667
+      [5]	 -0.48000000	 -0.08000000	  1.04000000	 -0.72000000	  0.56000000	 -0.48000000	 -0.64000000	  2.24000000
+      [6]	 -0.48000000	  0.56000000	 -0.72000000	  1.04000000	 -0.08000000	 -0.48000000	  0.64000000	  1.60000000
+      [7]	  0.48000000	 -0.24000000	 -0.56000000	 -0.56000000	 -0.24000000	  0.48000000	  0.00000000	  2.88000000
+    curl:
+      [0]	  2.00000000	  2.00000000	 -2.00000000	 -2.00000000	  2.00000000	  2.00000000	  0.00000000	  0.00000000
+      [1]	 -1.20000000	 -1.20000000	 -8.40000000	  1.20000000	  8.40000000	 -1.20000000	 -9.60000000	  9.60000000
+      [2]	 -1.20000000	  8.40000000	  1.20000000	 -8.40000000	 -1.20000000	 -1.20000000	  9.60000000	  9.60000000
+      [3]	  8.40000000	 -1.20000000	  1.20000000	  1.20000000	 -1.20000000	  8.40000000	  0.00000000	-19.20000000
diff --git a/tests/output/t402-qfunction-f.out b/tests/output/t402-qfunction-f.out
index 7163a434f1..dc4e005814 100644
--- a/tests/output/t402-qfunction-f.out
+++ b/tests/output/t402-qfunction-f.out
@@ -26,3 +26,20 @@ User CeedQFunction - mass
       EvalMode: "interpolation"
 CeedQFunctionContext
   Context Data Size: 40
+  User CeedQFunction - mass
+    2 input fields:
+      Input field 0:
+        Name: "qdata"
+        Size: 1
+        EvalMode: "none"
+      Input field 1:
+        Name: "u"
+        Size: 1
+        EvalMode: "interpolation"
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 1
+        EvalMode: "interpolation"
+  CeedQFunctionContext
+    Context Data Size: 40
diff --git a/tests/output/t402-qfunction.out b/tests/output/t402-qfunction.out
index 4d131f7852..ad131d8d36 100644
--- a/tests/output/t402-qfunction.out
+++ b/tests/output/t402-qfunction.out
@@ -27,3 +27,21 @@ User CeedQFunction - mass
 CeedQFunctionContext
   Context Data Size: 40
   Labeled double field: scale
+  User CeedQFunction - mass
+    2 input fields:
+      Input field 0:
+        Name: "q data"
+        Size: 1
+        EvalMode: "none"
+      Input field 1:
+        Name: "u"
+        Size: 1
+        EvalMode: "interpolation"
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 1
+        EvalMode: "interpolation"
+  CeedQFunctionContext
+    Context Data Size: 40
+    Labeled double field: scale
diff --git a/tests/output/t413-qfunction-f.out b/tests/output/t413-qfunction-f.out
index ffee1bdca7..05731e4204 100644
--- a/tests/output/t413-qfunction-f.out
+++ b/tests/output/t413-qfunction-f.out
@@ -28,3 +28,18 @@ Gallery CeedQFunction - MassApply
       Name: "v"
       Size: 1
       EvalMode: "interpolation"
+  Gallery CeedQFunction - MassApply
+    2 input fields:
+      Input field 0:
+        Name: "u"
+        Size: 1
+        EvalMode: "interpolation"
+      Input field 1:
+        Name: "qdata"
+        Size: 1
+        EvalMode: "none"
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 1
+        EvalMode: "interpolation"
diff --git a/tests/output/t413-qfunction.out b/tests/output/t413-qfunction.out
index ffee1bdca7..05731e4204 100644
--- a/tests/output/t413-qfunction.out
+++ b/tests/output/t413-qfunction.out
@@ -28,3 +28,18 @@ Gallery CeedQFunction - MassApply
       Name: "v"
       Size: 1
       EvalMode: "interpolation"
+  Gallery CeedQFunction - MassApply
+    2 input fields:
+      Input field 0:
+        Name: "u"
+        Size: 1
+        EvalMode: "interpolation"
+      Input field 1:
+        Name: "qdata"
+        Size: 1
+        EvalMode: "none"
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 1
+        EvalMode: "interpolation"
diff --git a/tests/output/t504-operator-f.out b/tests/output/t504-operator-f.out
index 3fcc6b0458..3b5857619f 100644
--- a/tests/output/t504-operator-f.out
+++ b/tests/output/t504-operator-f.out
@@ -1,4 +1,4 @@
-CeedOperator
+CeedOperator - setup
   15 elements with 8 quadrature points each
   3 fields
   2 input fields:
@@ -19,23 +19,23 @@ CeedOperator
       EvalMode: none
       No basis
       Active vector
-CeedOperator
-  15 elements with 8 quadrature points each
-  3 fields
-  2 input fields:
-    Input field 0:
-      Name: "rho"
-      Size: 1
-      EvalMode: none
-      No basis
-    Input field 1:
-      Name: "u"
-      Size: 2
-      EvalMode: interpolation
-      Active vector
-  1 output field:
-    Output field 0:
-      Name: "v"
-      Size: 2
-      EvalMode: interpolation
-      Active vector
+  CeedOperator - mass
+    15 elements with 8 quadrature points each
+    3 fields
+    2 input fields:
+      Input field 0:
+        Name: "rho"
+        Size: 1
+        EvalMode: none
+        No basis
+      Input field 1:
+        Name: "u"
+        Size: 2
+        EvalMode: interpolation
+        Active vector
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 2
+        EvalMode: interpolation
+        Active vector
diff --git a/tests/output/t504-operator.out b/tests/output/t504-operator.out
index 3fcc6b0458..4f23570743 100644
--- a/tests/output/t504-operator.out
+++ b/tests/output/t504-operator.out
@@ -1,4 +1,5 @@
-CeedOperator
+CeedOperator - setup
+CeedOperator - setup
   15 elements with 8 quadrature points each
   3 fields
   2 input fields:
@@ -19,23 +20,24 @@ CeedOperator
       EvalMode: none
       No basis
       Active vector
-CeedOperator
-  15 elements with 8 quadrature points each
-  3 fields
-  2 input fields:
-    Input field 0:
-      Name: "rho"
-      Size: 1
-      EvalMode: none
-      No basis
-    Input field 1:
-      Name: "u"
-      Size: 2
-      EvalMode: interpolation
-      Active vector
-  1 output field:
-    Output field 0:
-      Name: "v"
-      Size: 2
-      EvalMode: interpolation
-      Active vector
+  CeedOperator - mass
+  CeedOperator - mass
+    15 elements with 8 quadrature points each
+    3 fields
+    2 input fields:
+      Input field 0:
+        Name: "rho"
+        Size: 1
+        EvalMode: none
+        No basis
+      Input field 1:
+        Name: "u"
+        Size: 2
+        EvalMode: interpolation
+        Active vector
+    1 output field:
+      Output field 0:
+        Name: "v"
+        Size: 2
+        EvalMode: interpolation
+        Active vector
diff --git a/tests/output/t523-operator-f.out b/tests/output/t523-operator-f.out
index 1817a8a2cf..2a17d484bb 100644
--- a/tests/output/t523-operator-f.out
+++ b/tests/output/t523-operator-f.out
@@ -39,44 +39,44 @@ Composite CeedOperator - setup
         Size: 1
         EvalMode: none
         No basis
-Composite CeedOperator - mass
-  SubOperator 0 - triangle elements:
-    6 elements with 4 quadrature points each
-    3 fields
-    2 input fields:
-      Input field 0:
-        Name: "rho"
-        Size: 1
-        EvalMode: none
-        No basis
-      Input field 1:
-        Name: "u"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-    1 output field:
-      Output field 0:
-        Name: "v"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-  SubOperator 1 - quadrilateral elements:
-    6 elements with 16 quadrature points each
-    3 fields
-    2 input fields:
-      Input field 0:
-        Name: "rho"
-        Size: 1
-        EvalMode: none
-        No basis
-      Input field 1:
-        Name: "u"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-    1 output field:
-      Output field 0:
-        Name: "v"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
+  Composite CeedOperator - mass
+    SubOperator 0 - triangle elements:
+      6 elements with 4 quadrature points each
+      3 fields
+      2 input fields:
+        Input field 0:
+          Name: "rho"
+          Size: 1
+          EvalMode: none
+          No basis
+        Input field 1:
+          Name: "u"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+      1 output field:
+        Output field 0:
+          Name: "v"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+    SubOperator 1 - quadrilateral elements:
+      6 elements with 16 quadrature points each
+      3 fields
+      2 input fields:
+        Input field 0:
+          Name: "rho"
+          Size: 1
+          EvalMode: none
+          No basis
+        Input field 1:
+          Name: "u"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+      1 output field:
+        Output field 0:
+          Name: "v"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
diff --git a/tests/output/t523-operator.out b/tests/output/t523-operator.out
index 1817a8a2cf..742f6954a7 100644
--- a/tests/output/t523-operator.out
+++ b/tests/output/t523-operator.out
@@ -1,3 +1,6 @@
+Composite CeedOperator - setup
+  SubOperator 0 - triangle elements
+  SubOperator 1 - quadrilateral elements
 Composite CeedOperator - setup
   SubOperator 0 - triangle elements:
     6 elements with 4 quadrature points each
@@ -39,44 +42,47 @@ Composite CeedOperator - setup
         Size: 1
         EvalMode: none
         No basis
-Composite CeedOperator - mass
-  SubOperator 0 - triangle elements:
-    6 elements with 4 quadrature points each
-    3 fields
-    2 input fields:
-      Input field 0:
-        Name: "rho"
-        Size: 1
-        EvalMode: none
-        No basis
-      Input field 1:
-        Name: "u"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-    1 output field:
-      Output field 0:
-        Name: "v"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-  SubOperator 1 - quadrilateral elements:
-    6 elements with 16 quadrature points each
-    3 fields
-    2 input fields:
-      Input field 0:
-        Name: "rho"
-        Size: 1
-        EvalMode: none
-        No basis
-      Input field 1:
-        Name: "u"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
-    1 output field:
-      Output field 0:
-        Name: "v"
-        Size: 1
-        EvalMode: interpolation
-        Active vector
+  Composite CeedOperator - mass
+    SubOperator 0 - triangle elements
+    SubOperator 1 - quadrilateral elements
+  Composite CeedOperator - mass
+    SubOperator 0 - triangle elements:
+      6 elements with 4 quadrature points each
+      3 fields
+      2 input fields:
+        Input field 0:
+          Name: "rho"
+          Size: 1
+          EvalMode: none
+          No basis
+        Input field 1:
+          Name: "u"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+      1 output field:
+        Output field 0:
+          Name: "v"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+    SubOperator 1 - quadrilateral elements:
+      6 elements with 16 quadrature points each
+      3 fields
+      2 input fields:
+        Input field 0:
+          Name: "rho"
+          Size: 1
+          EvalMode: none
+          No basis
+        Input field 1:
+          Name: "u"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
+      1 output field:
+        Output field 0:
+          Name: "v"
+          Size: 1
+          EvalMode: interpolation
+          Active vector
diff --git a/tests/t003-ceed-f.f90 b/tests/t003-ceed-f.f90
index 61c00b3535..00147b869d 100644
--- a/tests/t003-ceed-f.f90
+++ b/tests/t003-ceed-f.f90
@@ -12,6 +12,9 @@ program test
 
       call ceedview(ceed,err)
 
+      call ceedsetnumviewtabs(ceed,1,err)
+      call ceedview(ceed,err)
+
       call ceeddestroy(ceed,err)
 
       end
diff --git a/tests/t003-ceed.c b/tests/t003-ceed.c
index 813c0cfe49..9e323c6d49 100644
--- a/tests/t003-ceed.c
+++ b/tests/t003-ceed.c
@@ -11,6 +11,18 @@ int main(int argc, char **argv) {
 
   CeedView(ceed, stdout);
 
+  CeedSetNumViewTabs(ceed, 1);
+  CeedView(ceed, stdout);
+
+  // Check CeedObject interface
+  {
+    Ceed ceed_copy = NULL;
+
+    CeedReferenceCopy(ceed, &ceed_copy);
+    CeedObjectView((CeedObject)ceed_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&ceed_copy);
+  }
+
   CeedDestroy(&ceed);
   return 0;
 }
diff --git a/tests/t008-ceed.c b/tests/t008-ceed.c
index 24b3fecff6..344b341ae7 100644
--- a/tests/t008-ceed.c
+++ b/tests/t008-ceed.c
@@ -11,7 +11,7 @@ int main(int argc, char **argv) {
   sprintf(help_resource, "help:%s", argv[1]);
 
   CeedInit(help_resource, &ceed);
-  CeedDestroy(&ceed);
 
+  CeedDestroy(&ceed);
   return 0;
 }
diff --git a/tests/t010-config.c b/tests/t010-config.c
new file mode 100644
index 0000000000..1becbc6ddd
--- /dev/null
+++ b/tests/t010-config.c
@@ -0,0 +1,14 @@
+/// @file
+/// Test git version and build configuration
+/// \test Test git version and build configuration
+#include <ceed.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  const char *git_version, *build_config;
+  CeedGetGitVersion(&git_version);
+  CeedGetBuildConfiguration(&build_config);
+  // printf("Git: %s\n", git_version);
+  // puts(build_config);
+  return 0;
+}
diff --git a/tests/t107-vector-f.f90 b/tests/t107-vector-f.f90
index 44531fe72b..51c2b79ff5 100644
--- a/tests/t107-vector-f.f90
+++ b/tests/t107-vector-f.f90
@@ -25,6 +25,9 @@ program test
 
       call ceedvectorview(x,err)
 
+      call ceedvectorsetnumviewtabs(x,1,err)
+      call ceedvectorview(x,err)
+
       call ceedvectordestroy(x,err)
       call ceeddestroy(ceed,err)
 
diff --git a/tests/t107-vector.c b/tests/t107-vector.c
index ffa27a508d..b6f3eb38df 100644
--- a/tests/t107-vector.c
+++ b/tests/t107-vector.c
@@ -17,6 +17,16 @@ int main(int argc, char **argv) {
 
   CeedVectorView(x, "%12.8f", stdout);
 
+  // Check tabs and CeedObject functionality
+  {
+    CeedVector x_copy = NULL;
+
+    CeedVectorReferenceCopy(x, &x_copy);
+    CeedVectorSetNumViewTabs(x_copy, 1);
+    CeedObjectView((CeedObject)x_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&x_copy);
+  }
+
   CeedVectorDestroy(&x);
   CeedDestroy(&ceed);
   return 0;
diff --git a/tests/t127-vector.c b/tests/t127-vector.c
new file mode 100644
index 0000000000..e13bf15d1b
--- /dev/null
+++ b/tests/t127-vector.c
@@ -0,0 +1,60 @@
+/// @file
+/// Test strided setting and copying of vectors
+/// \test Test strided setting and copying of vectors
+#include <ceed.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed       ceed;
+  CeedSize   start = 2, step = 3;
+  CeedVector x, y;
+  CeedInt    len = 10;
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, len, &x);
+  CeedVectorCreate(ceed, len, &y);
+
+  // Set strided
+  CeedVectorSetValue(x, 1.0);
+  CeedVectorSetValueStrided(x, start, -1, step, 42.0);
+  {
+    const CeedScalar *read_array;
+
+    CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array);
+    for (CeedInt i = 0; i < len; i++) {
+      CeedScalar value = (i - start) % step == 0 ? 42.0 : 1.0;
+
+      if (read_array[i] != value) {
+        // LCOV_EXCL_START
+        printf("Error in setting value in x at index %" CeedInt_FMT ", computed: %f actual: %f\n", i, read_array[i], value);
+        // LCOV_EXCL_STOP
+      }
+    }
+    CeedVectorRestoreArrayRead(x, &read_array);
+  }
+
+  // Copy strided
+  CeedVectorSetValue(y, 0.0);
+  CeedVectorCopyStrided(x, start, -1, step, y);
+  {
+    const CeedScalar *read_array;
+
+    CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array);
+    for (CeedInt i = 0; i < len; i++) {
+      CeedScalar value = (i - start) % step == 0 ? 42.0 : 0.0;
+
+      if (read_array[i] != value) {
+        // LCOV_EXCL_START
+        printf("Error in copying value to y at index %" CeedInt_FMT ", computed: %f actual: %f\n", i, read_array[i], value);
+        // LCOV_EXCL_STOP
+      }
+    }
+    CeedVectorRestoreArrayRead(y, &read_array);
+  }
+
+  CeedVectorDestroy(&x);
+  CeedVectorDestroy(&y);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t128-vector.c b/tests/t128-vector.c
new file mode 100644
index 0000000000..037b482cbe
--- /dev/null
+++ b/tests/t128-vector.c
@@ -0,0 +1,51 @@
+/// @file
+/// Test copying into vector with borrowed pointer
+/// \test Test copying into vector with borrowed pointer
+#include <ceed.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed       ceed;
+  CeedVector x, x_copy;
+  CeedInt    len = 10;
+  CeedScalar array_borrowed[len];
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, len, &x);
+  CeedVectorCreate(ceed, len, &x_copy);
+
+  {
+    CeedScalar array[len];
+
+    for (CeedInt i = 0; i < len; i++) {
+      array[i]          = i;
+      array_borrowed[i] = 10 + i;
+    }
+
+    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
+    CeedVectorSetArray(x_copy, CEED_MEM_HOST, CEED_USE_POINTER, array_borrowed);
+  }
+
+  // Copy to device if preferred
+  {
+    CeedMemType mem_type = CEED_MEM_HOST;
+
+    CeedGetPreferredMemType(ceed, &mem_type);
+    if (mem_type == CEED_MEM_DEVICE) CeedVectorSyncArray(x, CEED_MEM_DEVICE);
+  }
+
+  // Copy and sync borrowed array
+  CeedVectorCopy(x, x_copy);
+  CeedVectorSyncArray(x_copy, CEED_MEM_HOST);
+
+  // Check that borrowed array is the same as the original input array a
+  for (CeedInt i = 0; i < len; i++) {
+    if (array_borrowed[i] != i) printf("Error in copying values of CeedVector\n");
+  }
+
+  CeedVectorDestroy(&x);
+  CeedVectorDestroy(&x_copy);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t130-vector.c b/tests/t130-vector.c
new file mode 100644
index 0000000000..d223a1ad06
--- /dev/null
+++ b/tests/t130-vector.c
@@ -0,0 +1,44 @@
+/// @file
+/// Test getting and restoring work vectors
+/// \test Test getting and restoring work vectors
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  // Check for getting the same work vector back
+  {
+    CeedVector x, y;
+
+    CeedGetWorkVector(ceed, 20, &x);
+    // Do not do this!
+    CeedVector x_copy = x;
+
+    CeedRestoreWorkVector(ceed, &x);
+    CeedGetWorkVector(ceed, 20, &y);
+    if (y != x_copy) printf("failed to return same work vector");
+    CeedRestoreWorkVector(ceed, &y);
+  }
+
+  // Check for getting a new work vector back
+  {
+    CeedVector x, y;
+
+    CeedGetWorkVector(ceed, 20, &x);
+    // Do not do this!
+    CeedVector x_copy = x;
+
+    CeedRestoreWorkVector(ceed, &x);
+    CeedGetWorkVector(ceed, 30, &y);
+    if (y == x_copy) printf("failed to return new work vector");
+    CeedRestoreWorkVector(ceed, &y);
+  }
+
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t131-vector.c b/tests/t131-vector.c
new file mode 100644
index 0000000000..3fe78c6b94
--- /dev/null
+++ b/tests/t131-vector.c
@@ -0,0 +1,58 @@
+/// @file
+/// Test clearing work vectors
+/// \test Test clearing work vectors
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <math.h>
+#include <stdio.h>
+
+static CeedScalar expected_usage(CeedSize length) { return length * sizeof(CeedScalar) * 1e-6; }
+
+int main(int argc, char **argv) {
+  Ceed       ceed;
+  CeedVector x, y, z;
+  CeedScalar usage_mb;
+
+  CeedInit(argv[1], &ceed);
+
+  // Add work vectors of different lengths
+  CeedGetWorkVector(ceed, 10, &x);
+  CeedGetWorkVector(ceed, 20, &y);
+  CeedGetWorkVector(ceed, 30, &z);
+
+  // Check memory usage, should be 60 * sizeof(CeedScalar)
+  CeedGetWorkVectorMemoryUsage(ceed, &usage_mb);
+  if (fabs(usage_mb - expected_usage(60)) > 100. * CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(60));
+
+  // Restore x and z
+  CeedRestoreWorkVector(ceed, &x);
+  CeedRestoreWorkVector(ceed, &z);
+
+  // Clear work vectors with length < 30. This should:
+  //  - Remove x
+  //  - Leave y, since it is still in use
+  //  - Leave z, since it is length 30
+  CeedClearWorkVectors(ceed, 30);
+  CeedGetWorkVectorMemoryUsage(ceed, &usage_mb);
+  if (fabs(usage_mb - expected_usage(50)) > 100. * CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(50));
+
+  // Clear work vectors with length < 31. This should:
+  //  - Leave y, since it is still in use
+  //  - Remove z
+  CeedClearWorkVectors(ceed, 31);
+  CeedGetWorkVectorMemoryUsage(ceed, &usage_mb);
+  if (fabs(usage_mb - expected_usage(20)) > 100. * CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(20));
+
+  // Restore y
+  CeedRestoreWorkVector(ceed, &y);
+
+  // Make sure we can still get back y without allocating a new work vector
+  CeedGetWorkVector(ceed, 20, &y);
+  CeedGetWorkVectorMemoryUsage(ceed, &usage_mb);
+  if (fabs(usage_mb - expected_usage(20)) > 100. * CEED_EPSILON) printf("Wrong usage: %0.8g MB != %0.8g MB\n", usage_mb, expected_usage(20));
+  CeedRestoreWorkVector(ceed, &y);
+
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t210-elemrestriction-f.f90 b/tests/t210-elemrestriction-f.f90
index b22c4fe5d8..7bad3f941a 100644
--- a/tests/t210-elemrestriction-f.f90
+++ b/tests/t210-elemrestriction-f.f90
@@ -27,6 +27,9 @@ program test
 
       call ceedelemrestrictionview(r,err)
 
+      call ceedelemrestrictionsetnumviewtabs(r,1,err)
+      call ceedelemrestrictionview(r,err)
+
       call ceedelemrestrictiondestroy(r,err)
       call ceeddestroy(ceed,err)
 
diff --git a/tests/t210-elemrestriction.c b/tests/t210-elemrestriction.c
index 7aff301411..1cefd2d185 100644
--- a/tests/t210-elemrestriction.c
+++ b/tests/t210-elemrestriction.c
@@ -19,6 +19,16 @@ int main(int argc, char **argv) {
 
   CeedElemRestrictionView(elem_restriction, stdout);
 
+  // Check tabs and CeedObject functionality
+  {
+    CeedElemRestriction elem_restriction_copy = NULL;
+
+    CeedElemRestrictionReferenceCopy(elem_restriction, &elem_restriction_copy);
+    CeedElemRestrictionSetNumViewTabs(elem_restriction_copy, 1);
+    CeedObjectView((CeedObject)elem_restriction_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&elem_restriction_copy);
+  }
+
   CeedElemRestrictionDestroy(&elem_restriction);
   CeedDestroy(&ceed);
   return 0;
diff --git a/tests/t211-elemrestriction-f.f90 b/tests/t211-elemrestriction-f.f90
index 6d86c9c685..4cc27845fd 100644
--- a/tests/t211-elemrestriction-f.f90
+++ b/tests/t211-elemrestriction-f.f90
@@ -20,6 +20,9 @@ program test
 
       call ceedelemrestrictionview(r,err)
 
+      call ceedelemrestrictionsetnumviewtabs(r,1,err)
+      call ceedelemrestrictionview(r,err)
+
       call ceedelemrestrictiondestroy(r,err)
       call ceeddestroy(ceed,err)
 
diff --git a/tests/t211-elemrestriction.c b/tests/t211-elemrestriction.c
index 55ba2de881..3318a56f18 100644
--- a/tests/t211-elemrestriction.c
+++ b/tests/t211-elemrestriction.c
@@ -14,6 +14,8 @@ int main(int argc, char **argv) {
   CeedInt strides[3] = {1, 2, 2};
   CeedElemRestrictionCreateStrided(ceed, num_elem, 2, 1, num_elem * 2, strides, &elem_restriction);
 
+  CeedElemRestrictionView(elem_restriction, stdout);
+  CeedElemRestrictionSetNumViewTabs(elem_restriction, 1);
   CeedElemRestrictionView(elem_restriction, stdout);
 
   CeedElemRestrictionDestroy(&elem_restriction);
diff --git a/tests/t212-elemrestriction-f.f90 b/tests/t212-elemrestriction-f.f90
index b36f7c2ea3..9d1341052a 100644
--- a/tests/t212-elemrestriction-f.f90
+++ b/tests/t212-elemrestriction-f.f90
@@ -21,6 +21,9 @@ program test
 
       call ceedelemrestrictionview(r,err)
 
+      call ceedelemrestrictionsetnumviewtabs(r,1,err)
+      call ceedelemrestrictionview(r,err)
+
       call ceedelemrestrictiondestroy(r,err)
       call ceeddestroy(ceed,err)
 
diff --git a/tests/t212-elemrestriction.c b/tests/t212-elemrestriction.c
index 99f5dc1cea..3914727201 100644
--- a/tests/t212-elemrestriction.c
+++ b/tests/t212-elemrestriction.c
@@ -14,6 +14,8 @@ int main(int argc, char **argv) {
   CeedInt strides[3] = {1, 2, 2};
   CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, 2, 2, 1, num_elem * 2, strides, &elem_restriction);
 
+  CeedElemRestrictionView(elem_restriction, stdout);
+  CeedElemRestrictionSetNumViewTabs(elem_restriction, 1);
   CeedElemRestrictionView(elem_restriction, stdout);
 
   CeedElemRestrictionDestroy(&elem_restriction);
diff --git a/tests/t217-elemrestriction.c b/tests/t217-elemrestriction.c
index ca4f62a048..b9c52c52eb 100644
--- a/tests/t217-elemrestriction.c
+++ b/tests/t217-elemrestriction.c
@@ -55,10 +55,11 @@ int main(int argc, char **argv) {
 
     CeedVectorGetArrayRead(x, CEED_MEM_HOST, &x_array);
     for (CeedInt i = 0; i < num_elem + 1; i++) {
-      if (x_array[i] != (10 + i) * (i > 0 && i < num_elem ? 2.0 : 1.0))
+      if (x_array[i] != (10 + i) * (i > 0 && i < num_elem ? 2.0 : 1.0)) {
         // LCOV_EXCL_START
         printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", i, (CeedScalar)x_array[i]);
-      // LCOV_EXCL_STOP
+        // LCOV_EXCL_STOP
+      }
     }
     CeedVectorRestoreArrayRead(x, &x_array);
   }
diff --git a/tests/t231-elemrestriction.c b/tests/t231-elemrestriction.c
index de6cd2466e..21077001aa 100644
--- a/tests/t231-elemrestriction.c
+++ b/tests/t231-elemrestriction.c
@@ -2,6 +2,7 @@
 /// Test creation, use, and destruction of an element restriction at points
 /// \test Test creation, use, and destruction of an element restriction at points
 #include <ceed.h>
+#include <ceed/backend.h>
 #include <stdio.h>
 
 int main(int argc, char **argv) {
@@ -13,57 +14,61 @@ int main(int argc, char **argv) {
 
   CeedInit(argv[1], &ceed);
 
-  CeedVectorCreate(ceed, num_points, &x);
   {
-    CeedInt    point_index = num_elem;
-    CeedScalar array[num_points];
+    CeedInt offset      = num_elem + 1;
+    CeedInt point_index = num_elem;
 
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
 
+      ind[i] = offset;
       for (CeedInt j = 0; j < num_points_in_elem; j++) {
-        array[point_index] = i;
-        point_index        = (point_index + 1) % num_points;
+        ind[offset + j] = point_index;
+        point_index     = (point_index + 1) % num_points;
       }
+      offset += num_points_in_elem;
     }
-    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
+    ind[num_elem] = offset;
   }
-  CeedVectorCreate(ceed, num_points, &y);
+  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
 
+  CeedElemRestrictionCreateVector(elem_restriction, &x, &y);
+  CeedVectorSetValue(y, 0.0);
   {
-    CeedInt offset      = num_elem + 1;
-    CeedInt point_index = num_elem;
+    CeedInt    point_index = num_elem;
+    CeedScalar array[num_points];
 
     for (CeedInt i = 0; i < num_elem; i++) {
       CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
 
-      ind[i] = offset;
       for (CeedInt j = 0; j < num_points_in_elem; j++) {
-        ind[offset + j] = point_index;
-        point_index     = (point_index + 1) % num_points;
+        array[point_index] = i;
+        point_index        = (point_index + 1) % num_points;
       }
-      offset += num_points_in_elem;
     }
-    ind[num_elem] = offset;
+    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
   }
-  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
+
   CeedElemRestrictionApply(elem_restriction, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE);
   {
-    CeedInt           index = 0;
+    CeedInt           e_layout[3];
     const CeedScalar *read_array;
 
     CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array);
+    CeedElemRestrictionGetELayout(elem_restriction, e_layout);
 
     for (CeedInt i = 0; i < num_elem; i++) {
-      CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
+      CeedSize      elem_offset        = 0;
+      const CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
 
+      CeedElemRestrictionGetAtPointsElementOffset(elem_restriction, i, &elem_offset);
       for (CeedInt j = 0; j < num_points_in_elem; j++) {
-        if (i != read_array[index]) {
+        if (i != read_array[elem_offset + j * e_layout[0]]) {
           // LCOV_EXCL_START
-          printf("Error in restricted array y[%" CeedInt_FMT "] = %f\n", index, (CeedScalar)read_array[i]);
+          printf("Error in restricted array y[%" CeedInt_FMT "] = %f != %f\n", (CeedInt)elem_offset + j * e_layout[0],
+                 (CeedScalar)read_array[elem_offset + j * e_layout[0]], (CeedScalar)i);
           // LCOV_EXCL_STOP
         }
-        index++;
       }
     }
     CeedVectorRestoreArrayRead(y, &read_array);
diff --git a/tests/t232-elemrestriction.c b/tests/t232-elemrestriction.c
index 66557dba28..7632875fcf 100644
--- a/tests/t232-elemrestriction.c
+++ b/tests/t232-elemrestriction.c
@@ -1,7 +1,8 @@
 /// @file
-/// Test creation, use, and destruction of an element restriction at points for single elements
-/// \test Test creation, use, and destruction of an element restriction at points for single elements
+/// Test creation, use, and destruction of an element restriction at points
+/// \test Test creation, use, and destruction of an element restriction at points
 #include <ceed.h>
+#include <ceed/backend.h>
 #include <stdio.h>
 
 int main(int argc, char **argv) {
@@ -13,22 +14,6 @@ int main(int argc, char **argv) {
 
   CeedInit(argv[1], &ceed);
 
-  CeedVectorCreate(ceed, num_points, &x);
-  {
-    CeedInt    point_index = num_elem;
-    CeedScalar array[num_points];
-
-    for (CeedInt i = 0; i < num_elem; i++) {
-      CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
-
-      for (CeedInt j = 0; j < num_points_in_elem; j++) {
-        array[point_index] = i;
-        point_index        = (point_index + 1) % num_points;
-      }
-    }
-    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
-  }
-
   {
     CeedInt offset      = num_elem + 1;
     CeedInt point_index = num_elem;
@@ -47,30 +32,43 @@ int main(int argc, char **argv) {
   }
   CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
 
+  CeedElemRestrictionCreateVector(elem_restriction, &x, &y);
+  CeedVectorSetValue(y, 0.0);
   {
-    CeedInt max_points;
+    CeedInt    point_index = num_elem;
+    CeedScalar array[num_points];
 
-    CeedElemRestrictionGetMaxPointsInElement(elem_restriction, &max_points);
-    CeedVectorCreate(ceed, max_points, &y);
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
+
+      for (CeedInt j = 0; j < num_points_in_elem; j++) {
+        array[point_index] = i;
+        point_index        = (point_index + 1) % num_points;
+      }
+    }
+    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
   }
 
+  CeedElemRestrictionApply(elem_restriction, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE);
+  CeedElemRestrictionApply(elem_restriction, CEED_TRANSPOSE, y, x, CEED_REQUEST_IMMEDIATE);
   {
-    for (CeedInt i = 0; i < num_elem; i++) {
-      CeedInt           num_points_in_elem = (i + 1) % num_elem + 1;
-      const CeedScalar *read_array;
+    CeedInt           point_index = num_elem;
+    const CeedScalar *read_array;
 
-      CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE);
-      CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array);
+    CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array);
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
 
       for (CeedInt j = 0; j < num_points_in_elem; j++) {
-        if (i != read_array[j]) {
+        if (read_array[point_index] != 2 * i) {
           // LCOV_EXCL_START
-          printf("Error in restricted element array %" CeedInt_FMT " y[%" CeedInt_FMT "] = %f\n", i, j, (CeedScalar)read_array[j]);
+          printf("Error in restricted array x[%" CeedInt_FMT "] = %f != %f\n", point_index, read_array[point_index], 2.0 * i);
           // LCOV_EXCL_STOP
         }
+        point_index = (point_index + 1) % num_points;
       }
-      CeedVectorRestoreArrayRead(y, &read_array);
     }
+    CeedVectorRestoreArrayRead(x, &read_array);
   }
 
   CeedVectorDestroy(&x);
diff --git a/tests/t233-elemrestriction.c b/tests/t233-elemrestriction.c
index 1ad395b4d1..3573e1c349 100644
--- a/tests/t233-elemrestriction.c
+++ b/tests/t233-elemrestriction.c
@@ -1,8 +1,7 @@
 /// @file
-/// Test creation, transpose use, and destruction of an element restriction at points for single elements
-/// \test Test creation, transpose use, and destruction of an element restriction at points for single elements
+/// Test creation, use, and destruction of an element restriction at points for single elements
+/// \test Test creation, use, and destruction of an element restriction at points for single elements
 #include <ceed.h>
-#include <math.h>
 #include <stdio.h>
 
 int main(int argc, char **argv) {
@@ -14,9 +13,6 @@ int main(int argc, char **argv) {
 
   CeedInit(argv[1], &ceed);
 
-  CeedVectorCreate(ceed, num_points, &x);
-  CeedVectorSetValue(x, 0.0);
-
   {
     CeedInt offset      = num_elem + 1;
     CeedInt point_index = num_elem;
@@ -33,38 +29,53 @@ int main(int argc, char **argv) {
     }
     ind[num_elem] = offset;
   }
-  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind, &elem_restriction);
+  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_USE_POINTER, ind, &elem_restriction);
+
+  CeedElemRestrictionCreateVector(elem_restriction, &x, NULL);
+  {
+    CeedInt    point_index = num_elem;
+    CeedScalar array[num_points];
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
+
+      for (CeedInt j = 0; j < num_points_in_elem; j++) {
+        array[point_index] = i;
+        point_index        = (point_index + 1) % num_points;
+      }
+    }
+    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, array);
+  }
 
   {
-    CeedInt max_points;
+    CeedInt min_points, max_points;
 
+    CeedElemRestrictionGetMinPointsInElement(elem_restriction, &min_points);
     CeedElemRestrictionGetMaxPointsInElement(elem_restriction, &max_points);
+    if (min_points != 1 || max_points != num_elem) {
+      // LCOV_EXCL_START
+      printf("Error in min/max points: min %" CeedInt_FMT " max %" CeedInt_FMT "\n", min_points, max_points);
+      // LCOV_EXCL_STOP
+    }
     CeedVectorCreate(ceed, max_points, &y);
-    CeedVectorSetValue(y, 1.0);
   }
 
   {
     for (CeedInt i = 0; i < num_elem; i++) {
-      CeedInt           point_index = num_elem;
+      CeedInt           num_points_in_elem = (i + 1) % num_elem + 1;
       const CeedScalar *read_array;
 
-      CeedVectorSetValue(x, 0.0);
-      CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_TRANSPOSE, y, x, CEED_REQUEST_IMMEDIATE);
-
-      CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array);
-      for (CeedInt j = 0; j < num_elem; j++) {
-        CeedInt num_points_in_elem = (j + 1) % num_elem + 1;
+      CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_NOTRANSPOSE, x, y, CEED_REQUEST_IMMEDIATE);
+      CeedVectorGetArrayRead(y, CEED_MEM_HOST, &read_array);
 
-        for (CeedInt k = 0; k < num_points_in_elem; k++) {
-          if (fabs(read_array[point_index] - (i == j ? 1.0 : 0.0)) > 10 * CEED_EPSILON) {
-            // LCOV_EXCL_START
-            printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", point_index, (CeedScalar)read_array[point_index]);
-            // LCOV_EXCL_STOP
-          }
-          point_index = (point_index + 1) % num_points;
+      for (CeedInt j = 0; j < num_points_in_elem; j++) {
+        if (i != read_array[j]) {
+          // LCOV_EXCL_START
+          printf("Error in restricted element array %" CeedInt_FMT " y[%" CeedInt_FMT "] = %f\n", i, j, (CeedScalar)read_array[j]);
+          // LCOV_EXCL_STOP
         }
       }
-      CeedVectorRestoreArrayRead(x, &read_array);
+      CeedVectorRestoreArrayRead(y, &read_array);
     }
   }
 
diff --git a/tests/t234-elemrestriction.c b/tests/t234-elemrestriction.c
new file mode 100644
index 0000000000..3f434bd365
--- /dev/null
+++ b/tests/t234-elemrestriction.c
@@ -0,0 +1,81 @@
+/// @file
+/// Test creation, transpose use, and destruction of an element restriction at points for single elements
+/// \test Test creation, transpose use, and destruction of an element restriction at points for single elements
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed                ceed;
+  CeedInt             num_elem = 3, num_points = num_elem * 2;
+  CeedInt             ind[(num_elem + 1) + num_points];
+  CeedVector          x, y;
+  CeedElemRestriction elem_restriction;
+
+  CeedInit(argv[1], &ceed);
+
+  {
+    CeedInt offset      = num_elem + 1;
+    CeedInt point_index = num_elem;
+
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt num_points_in_elem = (i + 1) % num_elem + 1;
+
+      ind[i] = offset;
+      for (CeedInt j = 0; j < num_points_in_elem; j++) {
+        ind[offset + j] = point_index;
+        point_index     = (point_index + 1) % num_points;
+      }
+      offset += num_points_in_elem;
+    }
+    ind[num_elem] = offset;
+  }
+  CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind, &elem_restriction);
+
+  CeedElemRestrictionCreateVector(elem_restriction, &x, NULL);
+  CeedVectorSetValue(x, 0.0);
+  {
+    CeedInt min_points, max_points;
+
+    CeedElemRestrictionGetMinPointsInElement(elem_restriction, &min_points);
+    CeedElemRestrictionGetMaxPointsInElement(elem_restriction, &max_points);
+    if (min_points != 1 || max_points != num_elem) {
+      // LCOV_EXCL_START
+      printf("Error in min/max points: min %" CeedInt_FMT " max %" CeedInt_FMT "\n", min_points, max_points);
+      // LCOV_EXCL_STOP
+    }
+    CeedVectorCreate(ceed, max_points, &y);
+    CeedVectorSetValue(y, 1.0);
+  }
+
+  {
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt           point_index = num_elem;
+      const CeedScalar *read_array;
+
+      CeedVectorSetValue(x, 0.0);
+      CeedElemRestrictionApplyAtPointsInElement(elem_restriction, i, CEED_TRANSPOSE, y, x, CEED_REQUEST_IMMEDIATE);
+
+      CeedVectorGetArrayRead(x, CEED_MEM_HOST, &read_array);
+      for (CeedInt j = 0; j < num_elem; j++) {
+        CeedInt num_points_in_elem = (j + 1) % num_elem + 1;
+
+        for (CeedInt k = 0; k < num_points_in_elem; k++) {
+          if (fabs(read_array[point_index] - (i == j ? 1.0 : 0.0)) > 10 * CEED_EPSILON) {
+            // LCOV_EXCL_START
+            printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", point_index, (CeedScalar)read_array[point_index]);
+            // LCOV_EXCL_STOP
+          }
+          point_index = (point_index + 1) % num_points;
+        }
+      }
+      CeedVectorRestoreArrayRead(x, &read_array);
+    }
+  }
+
+  CeedVectorDestroy(&x);
+  CeedVectorDestroy(&y);
+  CeedElemRestrictionDestroy(&elem_restriction);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t300-basis-f.f90 b/tests/t300-basis-f.f90
index 1397a5a403..6aef5a0c34 100644
--- a/tests/t300-basis-f.f90
+++ b/tests/t300-basis-f.f90
@@ -18,7 +18,10 @@ program test
 
       call ceedbasiscreatetensorh1lagrange(ceed,1,1,4,4,ceed_gauss,b,err)
       call ceedbasisview(b,err)
+      call ceedbasissetnumviewtabs(b,1,err)
+      call ceedbasisview(b,err)
       call ceedbasisdestroy(b,err)
+
       call ceeddestroy(ceed,err)
 
       end
diff --git a/tests/t300-basis.c b/tests/t300-basis.c
index db17332def..d340be94e3 100644
--- a/tests/t300-basis.c
+++ b/tests/t300-basis.c
@@ -18,8 +18,18 @@ int main(int argc, char **argv) {
 
   CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 4, 4, CEED_GAUSS, &basis);
   CeedBasisView(basis, stdout);
-  CeedBasisDestroy(&basis);
 
+  // Check tabs and CeedObject functionality
+  {
+    CeedBasis basis_copy = NULL;
+
+    CeedBasisReferenceCopy(basis, &basis_copy);
+    CeedBasisSetNumViewTabs(basis_copy, 1);
+    CeedObjectView((CeedObject)basis_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&basis_copy);
+  }
+
+  CeedBasisDestroy(&basis);
   CeedDestroy(&ceed);
   return 0;
 }
diff --git a/tests/t302-basis.c b/tests/t302-basis.c
index 72623f400f..bfe94c8ef5 100644
--- a/tests/t302-basis.c
+++ b/tests/t302-basis.c
@@ -26,7 +26,7 @@ int main(int argc, char **argv) {
       if (fabs(collocated_gradient_1d[j + p * i] - gradient_1d[j + p * i]) > 100 * CEED_EPSILON) {
         // LCOV_EXCL_START
         printf("Error in collocated gradient %f != %f\n", collocated_gradient_1d[j + p * i], gradient_1d[j + p * i]);
-        // LCOV_EXCL_START
+        // LCOV_EXCL_STOP
       }
     }
   }
diff --git a/tests/t303-basis.c b/tests/t303-basis.c
index d71c97a6e7..baf844bf50 100644
--- a/tests/t303-basis.c
+++ b/tests/t303-basis.c
@@ -1,6 +1,6 @@
 /// @file
-/// Test checking BasisApply input/output vectors compatibility with basis dimensions
-/// \test Test checking BasisApply input/output vectors compatibility with basis dimensions
+/// Test checking BasisApply input/output vectors compatibility with basis
+/// \test Test checking BasisApply input/output vectors compatibility with basis
 
 //TESTARGS(only="cpu") {ceed_resource}
 #include <ceed.h>
@@ -15,7 +15,7 @@ int main(int argc, char **argv) {
   CeedInit(argv[1], &ceed);
 
   CeedVectorCreate(ceed, len, &u);
-  CeedVectorCreate(ceed, len + 1, &v);
+  CeedVectorCreate(ceed, len - 1, &v);
 
   CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis);
 
diff --git a/tests/t319-basis.c b/tests/t319-basis.c
index 8e542ad1c6..c314cb2e82 100644
--- a/tests/t319-basis.c
+++ b/tests/t319-basis.c
@@ -1,6 +1,7 @@
 /// @file
 /// Test projection interp and grad in multiple dimensions
 /// \test Test projection interp and grad in multiple dimensions
+#include "t319-basis.h"
 #include <ceed.h>
 #include <math.h>
 #include <stdio.h>
@@ -34,6 +35,79 @@ static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) {
   return tol;
 }
 
+static void VerifyProjectedBasis(CeedBasis basis_project, CeedInt dim, CeedInt p_to_dim, CeedInt p_from_dim, CeedVector x_to, CeedVector x_from,
+                                 CeedVector u_to, CeedVector u_from, CeedVector du_to) {
+  CeedScalar tol;
+
+  {
+    CeedScalarType scalar_type;
+
+    CeedGetScalarType(&scalar_type);
+    tol = GetTolerance(scalar_type, dim);
+  }
+
+  // Setup coarse solution
+  {
+    const CeedScalar *x_array;
+    CeedScalar        u_array[p_from_dim];
+
+    CeedVectorGetArrayRead(x_from, CEED_MEM_HOST, &x_array);
+    for (CeedInt i = 0; i < p_from_dim; i++) {
+      CeedScalar coord[dim];
+      for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_from_dim * d + i];
+      u_array[i] = Eval(dim, coord);
+    }
+    CeedVectorRestoreArrayRead(x_from, &x_array);
+    CeedVectorSetArray(u_from, CEED_MEM_HOST, CEED_COPY_VALUES, u_array);
+  }
+
+  // Project to fine basis
+  CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u_from, u_to);
+
+  // Check solution
+  {
+    const CeedScalar *x_array, *u_array;
+
+    CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array);
+    CeedVectorGetArrayRead(u_to, CEED_MEM_HOST, &u_array);
+    for (CeedInt i = 0; i < p_to_dim; i++) {
+      CeedScalar coord[dim];
+      for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * p_to_dim + i];
+      const CeedScalar u = Eval(dim, coord);
+      if (fabs(u - u_array[i]) > tol) printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, u_array[i], u);
+    }
+    CeedVectorRestoreArrayRead(x_to, &x_array);
+    CeedVectorRestoreArrayRead(u_to, &u_array);
+  }
+
+  // Project and take gradient
+  CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u_from, du_to);
+
+  // Check solution
+  {
+    const CeedScalar *x_array, *du_array;
+
+    CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array);
+    CeedVectorGetArrayRead(du_to, CEED_MEM_HOST, &du_array);
+    for (CeedInt i = 0; i < p_to_dim; i++) {
+      CeedScalar coord[dim];
+
+      for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_to_dim * d + i];
+      for (CeedInt d = 0; d < dim; d++) {
+        const CeedScalar du = EvalGrad(d, coord);
+
+        if (fabs(du - du_array[p_to_dim * d + i]) > tol) {
+          // LCOV_EXCL_START
+          printf("[%" CeedInt_FMT ", %" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, d, du_array[p_to_dim * (dim - 1 - d) + i], du);
+          // LCOV_EXCL_STOP
+        }
+      }
+    }
+    CeedVectorRestoreArrayRead(x_to, &x_array);
+    CeedVectorRestoreArrayRead(du_to, &du_array);
+  }
+}
+
 int main(int argc, char **argv) {
   Ceed ceed;
 
@@ -42,15 +116,8 @@ int main(int argc, char **argv) {
   for (CeedInt dim = 1; dim <= 3; dim++) {
     CeedVector x_corners, x_from, x_to, u_from, u_to, du_to;
     CeedBasis  basis_x, basis_from, basis_to, basis_project;
-    CeedInt    p_from = 5, p_to = 6, q = 7, x_dim = CeedIntPow(2, dim), p_from_dim = CeedIntPow(p_from, dim), p_to_dim = CeedIntPow(p_to, dim);
-    CeedScalar tol;
+    CeedInt    p_from = 4, p_to = 5, q = 6, x_dim = CeedIntPow(2, dim), p_from_dim = CeedIntPow(p_from, dim), p_to_dim = CeedIntPow(p_to, dim);
 
-    {
-      CeedScalarType scalar_type;
-
-      CeedGetScalarType(&scalar_type);
-      tol = GetTolerance(scalar_type, dim);
-    }
     CeedVectorCreate(ceed, x_dim * dim, &x_corners);
     {
       CeedScalar x_array[x_dim * dim];
@@ -82,66 +149,46 @@ int main(int argc, char **argv) {
     CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_to, q, CEED_GAUSS, &basis_to);
     CeedBasisCreateProjection(basis_from, basis_to, &basis_project);
 
-    // Setup coarse solution
-    {
-      const CeedScalar *x_array;
-      CeedScalar        u_array[p_from_dim];
-
-      CeedVectorGetArrayRead(x_from, CEED_MEM_HOST, &x_array);
-      for (CeedInt i = 0; i < p_from_dim; i++) {
-        CeedScalar coord[dim];
-        for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_from_dim * d + i];
-        u_array[i] = Eval(dim, coord);
-      }
-      CeedVectorRestoreArrayRead(x_from, &x_array);
-      CeedVectorSetArray(u_from, CEED_MEM_HOST, CEED_COPY_VALUES, u_array);
-    }
-
-    // Project to fine basis
-    CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u_from, u_to);
+    VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to);
 
-    // Check solution
+    // Create non-tensor bases
+    CeedBasis basis_from_nontensor, basis_to_nontensor;
     {
-      const CeedScalar *x_array, *u_array;
-
-      CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array);
-      CeedVectorGetArrayRead(u_to, CEED_MEM_HOST, &u_array);
-      for (CeedInt i = 0; i < p_to_dim; i++) {
-        CeedScalar coord[dim];
-        for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * p_to_dim + i];
-        const CeedScalar u = Eval(dim, coord);
-        if (fabs(u - u_array[i]) > tol) printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, u_array[i], u);
-      }
-      CeedVectorRestoreArrayRead(x_to, &x_array);
-      CeedVectorRestoreArrayRead(u_to, &u_array);
+      CeedElemTopology  topo;
+      CeedInt           num_comp, num_nodes, num_qpts;
+      const CeedScalar *interp, *grad;
+
+      CeedBasisGetTopology(basis_from, &topo);
+      CeedBasisGetNumComponents(basis_from, &num_comp);
+      CeedBasisGetNumNodes(basis_from, &num_nodes);
+      CeedBasisGetNumQuadraturePoints(basis_from, &num_qpts);
+      CeedBasisGetInterp(basis_from, &interp);
+      CeedBasisGetGrad(basis_from, &grad);
+      CeedBasisCreateH1(ceed, topo, num_comp, num_nodes, num_qpts, interp, grad, NULL, NULL, &basis_from_nontensor);
+
+      CeedBasisGetTopology(basis_to, &topo);
+      CeedBasisGetNumComponents(basis_to, &num_comp);
+      CeedBasisGetNumNodes(basis_to, &num_nodes);
+      CeedBasisGetNumQuadraturePoints(basis_to, &num_qpts);
+      CeedBasisGetInterp(basis_to, &interp);
+      CeedBasisGetGrad(basis_to, &grad);
+      CeedBasisCreateH1(ceed, topo, num_comp, num_nodes, num_qpts, interp, grad, NULL, NULL, &basis_to_nontensor);
     }
 
-    // Project and take gradient
-    CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u_from, du_to);
+    // Test projection on non-tensor bases
+    CeedBasisDestroy(&basis_project);
+    CeedBasisCreateProjection(basis_from_nontensor, basis_to_nontensor, &basis_project);
+    VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to);
 
-    // Check solution
-    {
-      const CeedScalar *x_array, *du_array;
-
-      CeedVectorGetArrayRead(x_to, CEED_MEM_HOST, &x_array);
-      CeedVectorGetArrayRead(du_to, CEED_MEM_HOST, &du_array);
-      for (CeedInt i = 0; i < p_to_dim; i++) {
-        CeedScalar coord[dim];
-
-        for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[p_to_dim * d + i];
-        for (CeedInt d = 0; d < dim; d++) {
-          const CeedScalar du = EvalGrad(d, coord);
-
-          if (fabs(du - du_array[p_to_dim * d + i]) > tol) {
-            // LCOV_EXCL_START
-            printf("[%" CeedInt_FMT ", %" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, d, du_array[p_to_dim * (dim - 1 - d) + i], du);
-            // LCOV_EXCL_STOP
-          }
-        }
-      }
-      CeedVectorRestoreArrayRead(x_to, &x_array);
-      CeedVectorRestoreArrayRead(du_to, &du_array);
-    }
+    // Test projection from non-tensor to tensor
+    CeedBasisDestroy(&basis_project);
+    CeedBasisCreateProjection(basis_from_nontensor, basis_to, &basis_project);
+    VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to);
+
+    // Test projection from tensor to non-tensor
+    CeedBasisDestroy(&basis_project);
+    CeedBasisCreateProjection(basis_from, basis_to_nontensor, &basis_project);
+    VerifyProjectedBasis(basis_project, dim, p_to_dim, p_from_dim, x_to, x_from, u_to, u_from, du_to);
 
     CeedVectorDestroy(&x_corners);
     CeedVectorDestroy(&x_from);
@@ -150,9 +197,52 @@ int main(int argc, char **argv) {
     CeedVectorDestroy(&u_to);
     CeedVectorDestroy(&du_to);
     CeedBasisDestroy(&basis_from);
+    CeedBasisDestroy(&basis_from_nontensor);
     CeedBasisDestroy(&basis_to);
+    CeedBasisDestroy(&basis_to_nontensor);
     CeedBasisDestroy(&basis_project);
   }
+
+  // Test projection between basis of different topological dimension
+  {
+    CeedInt   face_dim = 2, P_1D = 2;
+    CeedBasis basis_face, basis_cell_to_face, basis_proj;
+
+    CeedScalar       *q_ref = NULL, *q_weights = NULL;
+    const CeedScalar *grad, *interp;
+    CeedInt           P, Q;
+    GetCellToFaceTabulation(CEED_GAUSS, &P, &Q, &interp, &grad);
+
+    CeedBasisCreateTensorH1Lagrange(ceed, face_dim, 1, 2, P_1D, CEED_GAUSS, &basis_face);
+    CeedBasisCreateH1(ceed, CEED_TOPOLOGY_HEX, 1, P, Q, (CeedScalar *)interp, (CeedScalar *)grad, q_ref, q_weights, &basis_cell_to_face);
+    CeedBasisCreateProjection(basis_cell_to_face, basis_face, &basis_proj);
+    const CeedScalar *interp_proj, *grad_proj, *interp_proj_ref, *grad_proj_ref;
+
+    GetCellToFaceTabulation(CEED_GAUSS_LOBATTO, NULL, NULL, &interp_proj_ref, &grad_proj_ref);
+    CeedBasisGetInterp(basis_proj, &interp_proj);
+    CeedBasisGetGrad(basis_proj, &grad_proj);
+    CeedScalar tol = 100 * CEED_EPSILON;
+
+    for (CeedInt i = 0; i < 4 * 8; i++) {
+      if (fabs(interp_proj[i] - ((CeedScalar *)interp_proj_ref)[i]) > tol) {
+        // LCOV_EXCL_START
+        printf("Mixed Topology Projection: interp[%" CeedInt_FMT "] expected %f, got %f\n", i, interp_proj[i], ((CeedScalar *)interp_proj_ref)[i]);
+        // LCOV_EXCL_STOP
+      }
+    }
+
+    for (CeedInt i = 0; i < 3 * 4 * 8; i++) {
+      if (fabs(grad_proj[i] - ((CeedScalar *)grad_proj_ref)[i]) > tol) {
+        // LCOV_EXCL_START
+        printf("Mixed Topology Projection: grad[%" CeedInt_FMT "] expected %f, got %f\n", i, grad_proj[i], ((CeedScalar *)grad_proj_ref)[i]);
+        // LCOV_EXCL_STOP
+      }
+    }
+
+    CeedBasisDestroy(&basis_face);
+    CeedBasisDestroy(&basis_cell_to_face);
+    CeedBasisDestroy(&basis_proj);
+  }
   CeedDestroy(&ceed);
   return 0;
 }
diff --git a/tests/t319-basis.h b/tests/t319-basis.h
new file mode 100644
index 0000000000..965a7fcd0c
--- /dev/null
+++ b/tests/t319-basis.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+
+// Interpolation matrices for cell-to-face of Q1 hexahedral element onto it's "5" face (in PETSc)
+// Nodes are at Gauss-Lobatto points and quadrature points are Gauss, all over [-1,1] domain range
+const CeedScalar Q1_interp_gauss[4][8] = {
+    {0.62200846792814612,  0, 0.16666666666666669,  0, 0.16666666666666669,  0, 0.044658198738520463, 0},
+    {0.16666666666666669,  0, 0.62200846792814612,  0, 0.044658198738520463, 0, 0.16666666666666669,  0},
+    {0.16666666666666669,  0, 0.044658198738520463, 0, 0.62200846792814612,  0, 0.16666666666666669,  0},
+    {0.044658198738520463, 0, 0.16666666666666669,  0, 0.16666666666666669,  0, 0.62200846792814612,  0}
+};
+const CeedScalar Q1_grad_gauss[3][4][8] = {
+    {{-0.31100423396407312, 0.31100423396407312, -0.083333333333333343, 0.083333333333333343, -0.083333333333333343, 0.083333333333333343,
+      -0.022329099369260232, 0.022329099369260232},
+     {-0.083333333333333343, 0.083333333333333343, -0.31100423396407312, 0.31100423396407312, -0.022329099369260232, 0.022329099369260232,
+      -0.083333333333333343, 0.083333333333333343},
+     {-0.083333333333333343, 0.083333333333333343, -0.022329099369260232, 0.022329099369260232, -0.31100423396407312, 0.31100423396407312,
+      -0.083333333333333343, 0.083333333333333343},
+     {-0.022329099369260232, 0.022329099369260232, -0.083333333333333343, 0.083333333333333343, -0.083333333333333343, 0.083333333333333343,
+      -0.31100423396407312, 0.31100423396407312}                                                       },
+    {{-0.39433756729740643, 0, 0.39433756729740643, 0, -0.10566243270259357, 0, 0.10566243270259357, 0},
+     {-0.39433756729740643, 0, 0.39433756729740643, 0, -0.10566243270259357, 0, 0.10566243270259357, 0},
+     {-0.10566243270259357, 0, 0.10566243270259357, 0, -0.39433756729740643, 0, 0.39433756729740643, 0},
+     {-0.10566243270259357, 0, 0.10566243270259357, 0, -0.39433756729740643, 0, 0.39433756729740643, 0}},
+    {{-0.39433756729740643, 0, -0.10566243270259357, 0, 0.39433756729740643, 0, 0.10566243270259357, 0},
+     {-0.10566243270259357, 0, -0.39433756729740643, 0, 0.10566243270259357, 0, 0.39433756729740643, 0},
+     {-0.39433756729740643, 0, -0.10566243270259357, 0, 0.39433756729740643, 0, 0.10566243270259357, 0},
+     {-0.10566243270259357, 0, -0.39433756729740643, 0, 0.10566243270259357, 0, 0.39433756729740643, 0}}
+};
+
+const CeedScalar Q1_interp_gauss_lobatto[4][8] = {
+    {1, 0, 0, 0, 0, 0, 0, 0},
+    {0, 0, 1, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 1, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0, 1, 0}
+};
+/* clang-format off */
+const CeedScalar Q1_grad_gauss_lobatto[3][4][8] = {
+    {{-0.5,  0.5, 0,    0,   0,    0,   0,    0},
+      {0,    0,   -0.5, 0.5, 0,    0,   0,    0},
+      {0,    0,   0,    0,   -0.5, 0.5, 0,    0},
+      {0,    0,   0,    0,   0,    0,   -0.5, 0.5}},
+    {{-0.5,  0,   0.5,  0,   0,    0,   0,    0},
+      {-0.5, 0,   0.5,  0,   0,    0,   0,    0},
+      {0,    0,   0,    0,   -0.5, 0,   0.5,  0},
+      {0,    0,   0,    0,   -0.5, 0,   0.5,  0}},
+    {{-0.5,  0,   0,    0,   0.5,  0,   0,    0},
+      {0,    0,   -0.5, 0,   0,    0,   0.5,  0},
+      {-0.5, 0,   0,    0,   0.5,  0,   0,    0},
+      {0,    0,   -0.5, 0,   0,    0,   0.5,  0}}
+};
+/* clang-format on */
+
+static void GetCellToFaceTabulation(CeedQuadMode quad_mode, CeedInt *P, CeedInt *Q, const CeedScalar **interp, const CeedScalar **grad) {
+  if (P) *P = 8;
+  if (Q) *Q = 4;
+
+  if (quad_mode == CEED_GAUSS) {
+    *interp = (const CeedScalar *)Q1_interp_gauss;
+    *grad   = (const CeedScalar *)Q1_grad_gauss;
+  }
+  if (quad_mode == CEED_GAUSS_LOBATTO) {
+    *interp = (const CeedScalar *)Q1_interp_gauss_lobatto;
+    *grad   = (const CeedScalar *)Q1_grad_gauss_lobatto;
+  }
+}
diff --git a/tests/t320-basis-f.f90 b/tests/t320-basis-f.f90
index 46dffdede5..c8eb67fee8 100644
--- a/tests/t320-basis-f.f90
+++ b/tests/t320-basis-f.f90
@@ -32,6 +32,8 @@ program test
       call ceedbasiscreateh1(ceed,ceed_triangle,1,p,q,interp,grad,qref,qweight,&
      & b,err)
       call ceedbasisview(b,err)
+      call ceedbasissetnumviewtabs(b,1,err)
+      call ceedbasisview(b,err)
 
       call ceedbasisdestroy(b,err)
       call ceeddestroy(ceed,err)
diff --git a/tests/t320-basis-f.h b/tests/t320-basis-f.h
index 93129e54de..84e7486a10 100644
--- a/tests/t320-basis-f.h
+++ b/tests/t320-basis-f.h
@@ -1,4 +1,4 @@
-! Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+! Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 ! All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 !
 ! SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t320-basis.c b/tests/t320-basis.c
index c028fcd0a5..20309ec1ed 100644
--- a/tests/t320-basis.c
+++ b/tests/t320-basis.c
@@ -20,6 +20,8 @@ int main(int argc, char **argv) {
   Build2DSimplex(q_ref, q_weight, interp, grad);
   CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis);
   CeedBasisView(basis, stdout);
+  CeedBasisSetNumViewTabs(basis, 1);
+  CeedBasisView(basis, stdout);
 
   CeedBasisDestroy(&basis);
   CeedDestroy(&ceed);
diff --git a/tests/t320-basis.h b/tests/t320-basis.h
index ef38e43b0a..30f8e824d0 100644
--- a/tests/t320-basis.h
+++ b/tests/t320-basis.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t330-basis.c b/tests/t330-basis.c
index dfbf3373ff..bd96afd40d 100644
--- a/tests/t330-basis.c
+++ b/tests/t330-basis.c
@@ -21,6 +21,8 @@ int main(int argc, char **argv) {
   BuildHdivQuadrilateral(q, q_ref, q_weights, interp, div, CEED_GAUSS);
   CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, 1, p, num_qpts, interp, div, q_ref, q_weights, &basis);
   CeedBasisView(basis, stdout);
+  CeedBasisSetNumViewTabs(basis, 1);
+  CeedBasisView(basis, stdout);
 
   CeedBasisDestroy(&basis);
   CeedDestroy(&ceed);
diff --git a/tests/t330-basis.h b/tests/t330-basis.h
index 82ae5a3d81..b75bd421b9 100644
--- a/tests/t330-basis.h
+++ b/tests/t330-basis.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t340-basis.c b/tests/t340-basis.c
index e9af85ff5f..8a70269bec 100644
--- a/tests/t340-basis.c
+++ b/tests/t340-basis.c
@@ -20,6 +20,8 @@ int main(int argc, char **argv) {
   BuildHcurl2DSimplex(q_ref, q_weight, interp, curl);
   CeedBasisCreateHcurl(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, curl, q_ref, q_weight, &basis);
   CeedBasisView(basis, stdout);
+  CeedBasisSetNumViewTabs(basis, 1);
+  CeedBasisView(basis, stdout);
 
   CeedBasisDestroy(&basis);
   CeedDestroy(&ceed);
diff --git a/tests/t340-basis.h b/tests/t340-basis.h
index 5fd8c420bc..90aef60f15 100644
--- a/tests/t340-basis.h
+++ b/tests/t340-basis.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t350-basis.c b/tests/t350-basis.c
index 54979bb9a0..becc0d98ea 100644
--- a/tests/t350-basis.c
+++ b/tests/t350-basis.c
@@ -56,7 +56,7 @@ int main(int argc, char **argv) {
 
     CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
   }
-  CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+  CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
 
   {
     const CeedScalar *x_array, *v_array;
diff --git a/tests/t351-basis.c b/tests/t351-basis.c
index 14b23730e1..84f59cc838 100644
--- a/tests/t351-basis.c
+++ b/tests/t351-basis.c
@@ -65,7 +65,7 @@ int main(int argc, char **argv) {
 
       CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
     }
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
 
     {
       const CeedScalar *x_array, *v_array;
diff --git a/tests/t352-basis.c b/tests/t352-basis.c
index a4bf13d8b6..c2da0e2dd4 100644
--- a/tests/t352-basis.c
+++ b/tests/t352-basis.c
@@ -65,7 +65,7 @@ int main(int argc, char **argv) {
 
       CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
     }
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
 
     {
       const CeedScalar *x_array, *v_array;
diff --git a/tests/t353-basis.c b/tests/t353-basis.c
index 22f80ddcdd..83fd16adb0 100644
--- a/tests/t353-basis.c
+++ b/tests/t353-basis.c
@@ -60,17 +60,18 @@ int main(int argc, char **argv) {
 
     CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
   }
-  CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+  CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
 
   for (CeedInt i = 0; i < num_points; i++) {
-    CeedScalar        fx = 0.0;
+    const CeedInt     num_point[1] = {1};
+    CeedScalar        fx           = 0.0;
     const CeedScalar *x_array, *u_array, *v_array, *u_point_array;
 
     CeedVectorGetArrayRead(x_points, CEED_MEM_HOST, &x_array);
     CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array);
     CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
     CeedVectorSetValue(x_point, x_array[i]);
-    CeedBasisApplyAtPoints(basis_u, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
+    CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
     CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array);
     for (CeedInt j = 0; j < p; j++) fx += u_array[j] * u_point_array[j];
     if (fabs(v_array[i] - fx) > 100. * CEED_EPSILON) printf("%f != %f = f(%f)\n", v_array[i], fx, x_array[i]);
diff --git a/tests/t354-basis.c b/tests/t354-basis.c
index 85f0ac2293..4d3402b257 100644
--- a/tests/t354-basis.c
+++ b/tests/t354-basis.c
@@ -1,6 +1,6 @@
 /// @file
-/// Test polynomial interpolation to arbitrary points in multiple dimensions
-/// \test Test polynomial interpolation to arbitrary points in multiple dimensions
+/// Test polynomial interpolation transpose to arbitrary points in multiple dimensions
+/// \test Test polynomial interpolation transpose to arbitrary points in multiple dimensions
 #include <ceed.h>
 #include <math.h>
 #include <stdio.h>
@@ -69,10 +69,11 @@ int main(int argc, char **argv) {
 
       CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
     }
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
 
     for (CeedInt i = 0; i < num_points; i++) {
-      CeedScalar        fx = 0.0;
+      const CeedInt     num_point[1] = {1};
+      CeedScalar        fx           = 0.0;
       CeedScalar        coord[dim];
       const CeedScalar *x_array, *u_array, *v_array, *u_point_array;
 
@@ -81,10 +82,10 @@ int main(int argc, char **argv) {
       CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
       for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * num_points + i];
       CeedVectorSetArray(x_point, CEED_MEM_HOST, CEED_COPY_VALUES, coord);
-      CeedBasisApplyAtPoints(basis_u, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
+      CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
       CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array);
       for (CeedInt j = 0; j < p_dim; j++) fx += u_array[j] * u_point_array[j];
-      if (fabs(v_array[i] - fx) > 100. * CEED_EPSILON) {
+      if (fabs(v_array[i] - fx) > 500. * CEED_EPSILON) {
         // LCOV_EXCL_START
         printf("[%" CeedInt_FMT "] %f != %f = f(%f", dim, v_array[i], fx, coord[0]);
         for (CeedInt d = 1; d < dim; d++) printf(", %f", coord[d]);
diff --git a/tests/t355-basis.c b/tests/t355-basis.c
index 7fd7906dcb..5b93764a7a 100644
--- a/tests/t355-basis.c
+++ b/tests/t355-basis.c
@@ -62,7 +62,7 @@ int main(int argc, char **argv) {
 
     CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
   }
-  CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v);
+  CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v);
 
   {
     const CeedScalar *x_array, *v_array;
diff --git a/tests/t356-basis.c b/tests/t356-basis.c
index 8eb3c57e7c..263cc43b66 100644
--- a/tests/t356-basis.c
+++ b/tests/t356-basis.c
@@ -75,7 +75,7 @@ int main(int argc, char **argv) {
 
       CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
     }
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, v);
 
     {
       const CeedScalar *x_array, *v_array;
diff --git a/tests/t357-basis.c b/tests/t357-basis.c
index ecfa56476c..0f4e105a66 100644
--- a/tests/t357-basis.c
+++ b/tests/t357-basis.c
@@ -82,8 +82,8 @@ int main(int argc, char **argv) {
     }
 
     // Calculate G u at arbitrary points, G' * 1 at dofs
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, u_points);
-    CeedBasisApplyAtPoints(basis_u, num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, u_points);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v);
     {
       const CeedScalar *u_array, *v_array, *u_points_array;
 
diff --git a/tests/t360-basis.c b/tests/t360-basis.c
new file mode 100644
index 0000000000..f953157e1c
--- /dev/null
+++ b/tests/t360-basis.c
@@ -0,0 +1,56 @@
+/// @file
+/// Test interpolation ApplyAdd in multiple dimensions
+/// \test Test interpolation ApplyAdd in multiple dimensions
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt dim = 1; dim <= 3; dim++) {
+    CeedVector u, u_q, v, v_q, w_q;
+    CeedBasis  basis;
+    CeedInt    p = 4, q = 5, p_dim = CeedIntPow(p, dim), q_dim = CeedIntPow(q, dim);
+
+    CeedVectorCreate(ceed, p_dim, &u);
+    CeedVectorCreate(ceed, p_dim, &v);
+    CeedVectorSetValue(u, 1.0);
+    CeedVectorSetValue(v, 0.0);
+    CeedVectorCreate(ceed, q_dim, &u_q);
+    CeedVectorCreate(ceed, q_dim, &v_q);
+    CeedVectorCreate(ceed, q_dim, &w_q);
+
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis);
+
+    // Compute area
+    CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, u_q);
+    CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, w_q);
+    CeedVectorPointwiseMult(v_q, u_q, w_q);
+    CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v);
+    // Double area computed
+    CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v);
+
+    // Check area
+    {
+      const CeedScalar *v_array;
+      CeedScalar        area = 0.0;
+
+      CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+      for (CeedInt i = 0; i < p_dim; i++) area += v_array[i];
+      if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 5E-6) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim));
+      CeedVectorRestoreArrayRead(v, &v_array);
+    }
+
+    CeedVectorDestroy(&u);
+    CeedVectorDestroy(&v);
+    CeedVectorDestroy(&u_q);
+    CeedVectorDestroy(&v_q);
+    CeedVectorDestroy(&w_q);
+    CeedBasisDestroy(&basis);
+  }
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t361-basis.c b/tests/t361-basis.c
new file mode 100644
index 0000000000..6671a39ae5
--- /dev/null
+++ b/tests/t361-basis.c
@@ -0,0 +1,116 @@
+/// @file
+/// Test grad ApplyAdd in multiple dimensions
+/// \test Test grad ApplyAdd in multiple dimensions
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) {
+  CeedScalar result = tanh(x[0] + 0.1);
+  if (dim > 1) result += atan(x[1] + 0.2);
+  if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3));
+  return result;
+}
+
+static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) {
+  CeedScalar tol;
+  if (scalar_type == CEED_SCALAR_FP32) {
+    if (dim == 3) tol = 0.05;
+    else tol = 1.e-3;
+  } else {
+    tol = 1.e-10;
+  }
+  return 2.0 * tol;
+}
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt dim = 1; dim <= 3; dim++) {
+    CeedVector x, x_q, u, u_q, ones, v;
+    CeedBasis  basis_x_lobatto, basis_u_gauss;
+    CeedInt    p = 8, q = 10, p_dim = CeedIntPow(p, dim), q_dim = CeedIntPow(q, dim), x_dim = CeedIntPow(2, dim);
+    CeedScalar sum_1 = 0, sum_2 = 0;
+
+    CeedVectorCreate(ceed, x_dim * dim, &x);
+    {
+      CeedScalar x_array[x_dim * dim];
+
+      for (CeedInt d = 0; d < dim; d++) {
+        for (CeedInt i = 0; i < x_dim; i++) x_array[d * x_dim + i] = (i % CeedIntPow(2, d + 1)) / CeedIntPow(2, d) ? 1 : -1;
+      }
+      CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    CeedVectorCreate(ceed, p_dim * dim, &x_q);
+    CeedVectorSetValue(x_q, 0);
+    CeedVectorCreate(ceed, p_dim, &u);
+    CeedVectorCreate(ceed, q_dim * dim, &u_q);
+    CeedVectorSetValue(u_q, 0);
+    CeedVectorCreate(ceed, q_dim * dim, &ones);
+    CeedVectorSetValue(ones, 1);
+    CeedVectorCreate(ceed, p_dim, &v);
+    CeedVectorSetValue(v, 0);
+
+    // Get function values at quadrature points
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, p, CEED_GAUSS_LOBATTO, &basis_x_lobatto);
+    CeedBasisApply(basis_x_lobatto, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_q);
+
+    {
+      const CeedScalar *x_q_array;
+      CeedScalar        u_array[p_dim];
+
+      CeedVectorGetArrayRead(x_q, CEED_MEM_HOST, &x_q_array);
+      for (CeedInt i = 0; i < p_dim; i++) {
+        CeedScalar coord[dim];
+
+        for (CeedInt d = 0; d < dim; d++) coord[d] = x_q_array[d * p_dim + i];
+        u_array[i] = Eval(dim, coord);
+      }
+      CeedVectorRestoreArrayRead(x_q, &x_q_array);
+      CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, u_array);
+    }
+
+    // Calculate G u at quadrature points, G' * 1 at dofs
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u_gauss);
+    CeedBasisApply(basis_u_gauss, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u, u_q);
+    CeedVectorScale(u_q, 2.0);
+    CeedBasisApply(basis_u_gauss, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, ones, v);
+    CeedBasisApplyAdd(basis_u_gauss, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, ones, v);
+
+    // Check if 1' * G * u = u' * (G' * 1)
+    {
+      const CeedScalar *v_array, *u_array, *u_q_array;
+
+      CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+      CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array);
+      CeedVectorGetArrayRead(u_q, CEED_MEM_HOST, &u_q_array);
+      for (CeedInt i = 0; i < p_dim; i++) sum_1 += v_array[i] * u_array[i];
+      for (CeedInt i = 0; i < dim * q_dim; i++) sum_2 += u_q_array[i];
+      CeedVectorRestoreArrayRead(v, &v_array);
+      CeedVectorRestoreArrayRead(u, &u_array);
+      CeedVectorRestoreArrayRead(u_q, &u_q_array);
+    }
+    {
+      CeedScalarType scalar_type;
+
+      CeedGetScalarType(&scalar_type);
+
+      CeedScalar tol = GetTolerance(scalar_type, dim);
+
+      if (fabs(sum_1 - sum_2) > tol) printf("[%" CeedInt_FMT "] %0.12f != %0.12f\n", dim, sum_1, sum_2);
+    }
+
+    CeedVectorDestroy(&x);
+    CeedVectorDestroy(&x_q);
+    CeedVectorDestroy(&u);
+    CeedVectorDestroy(&u_q);
+    CeedVectorDestroy(&ones);
+    CeedVectorDestroy(&v);
+    CeedBasisDestroy(&basis_x_lobatto);
+    CeedBasisDestroy(&basis_u_gauss);
+  }
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t362-basis.c b/tests/t362-basis.c
new file mode 100644
index 0000000000..bff1937d66
--- /dev/null
+++ b/tests/t362-basis.c
@@ -0,0 +1,59 @@
+/// @file
+/// Test integration ApplyAdd with a 2D Simplex non-tensor H^1 basis
+/// \test Test integration ApplyAdd with a 2D Simplex non-tensor H^1 basis
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "t320-basis.h"
+
+// main test
+int main(int argc, char **argv) {
+  Ceed          ceed;
+  CeedVector    u, v, u_q, v_q, w_q;
+  const CeedInt p = 6, q = 4, dim = 2;
+  CeedBasis     basis;
+  CeedScalar    q_ref[dim * q], q_weight[q];
+  CeedScalar    interp[p * q], grad[dim * p * q];
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, p, &u);
+  CeedVectorCreate(ceed, p, &v);
+  CeedVectorSetValue(u, 1.0);
+  CeedVectorSetValue(v, 0.0);
+  CeedVectorCreate(ceed, q, &u_q);
+  CeedVectorCreate(ceed, q, &v_q);
+  CeedVectorCreate(ceed, q, &w_q);
+
+  Build2DSimplex(q_ref, q_weight, interp, grad);
+  CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis);
+
+  // Compute area
+  CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, u_q);
+  CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, w_q);
+  CeedVectorPointwiseMult(v_q, u_q, w_q);
+  CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v);
+  // Double area computed
+  CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, v_q, v);
+
+  // Check area
+  {
+    const CeedScalar *v_array;
+    CeedScalar        area = 0.0;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < p; i++) area += v_array[i];
+    if (fabs(area - 1.0) > 1E-6) printf("Incorrect area computed %f != %f\n", area, 1.0);
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&u_q);
+  CeedVectorDestroy(&v_q);
+  CeedVectorDestroy(&w_q);
+  CeedBasisDestroy(&basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t363-basis.c b/tests/t363-basis.c
new file mode 100644
index 0000000000..6c19f34027
--- /dev/null
+++ b/tests/t363-basis.c
@@ -0,0 +1,54 @@
+/// @file
+/// Test grad transpose ApplyAdd with a 2D Simplex non-tensor H^1 basis
+/// \test Test grad transpose ApplyAdd with a 2D Simplex non-tensor H^1 basis
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "t320-basis.h"
+
+int main(int argc, char **argv) {
+  Ceed          ceed;
+  CeedVector    u, v;
+  const CeedInt p = 6, q = 4, dim = 2;
+  CeedBasis     basis;
+  CeedScalar    q_ref[dim * q], q_weight[q];
+  CeedScalar    interp[p * q], grad[dim * p * q];
+  CeedScalar    column_sum[p];
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, q * dim, &u);
+  CeedVectorSetValue(u, 1);
+  CeedVectorCreate(ceed, p, &v);
+  CeedVectorSetValue(v, 0);
+
+  Build2DSimplex(q_ref, q_weight, interp, grad);
+  CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, p, q, interp, grad, q_ref, q_weight, &basis);
+
+  CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, u, v);
+  CeedBasisApplyAdd(basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, u, v);
+
+  // Check values at quadrature points
+  for (int i = 0; i < p; i++) {
+    column_sum[i] = 0;
+    for (int j = 0; j < q * dim; j++) {
+      column_sum[i] += grad[i + j * p];
+    }
+  }
+  {
+    const CeedScalar *v_array;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (int i = 0; i < p; i++) {
+      if (fabs(column_sum[i] - v_array[i] / 2.0) > 100. * CEED_EPSILON) printf("[%" CeedInt_FMT "] %f != %f\n", i, v_array[i] / 2.0, column_sum[i]);
+    }
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedBasisDestroy(&basis);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t364-basis.c b/tests/t364-basis.c
new file mode 100644
index 0000000000..6ab4058d30
--- /dev/null
+++ b/tests/t364-basis.c
@@ -0,0 +1,98 @@
+/// @file
+/// Test polynomial interpolation transpose ApplyAdd from arbitrary points in 1D
+/// \test Test polynomial interpolation transpose ApplyAdd from arbitrary points in 1D
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+#define ALEN(a) (sizeof(a) / sizeof((a)[0]))
+
+static CeedScalar Eval(CeedScalar x, CeedInt n, const CeedScalar *c) {
+  CeedScalar y = c[n - 1];
+  for (CeedInt i = n - 2; i >= 0; i--) y = y * x + c[i];
+  return y;
+}
+
+int main(int argc, char **argv) {
+  Ceed             ceed;
+  CeedVector       x, x_nodes, x_points, x_point, u, v, u_point, v_point;
+  CeedBasis        basis_x, basis_u;
+  const CeedInt    p = 5, q = 5, num_points = 4;
+  const CeedScalar c[4] = {1, 2, 3, 4};  // 1 + 2x + 3x^2 + ...
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, 2, &x);
+  CeedVectorCreate(ceed, p, &x_nodes);
+  CeedVectorCreate(ceed, num_points, &x_points);
+  CeedVectorCreate(ceed, 1, &x_point);
+  CeedVectorCreate(ceed, p, &u);
+  CeedVectorCreate(ceed, num_points, &v);
+  CeedVectorCreate(ceed, p, &u_point);
+  CeedVectorCreate(ceed, 1, &v_point);
+  CeedVectorSetValue(v_point, 1.0);
+
+  // Get nodal coordinates
+  CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, p, CEED_GAUSS_LOBATTO, &basis_x);
+  {
+    CeedScalar x_array[2];
+
+    for (CeedInt i = 0; i < 2; i++) x_array[i] = CeedIntPow(-1, i + 1);
+    CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+  }
+  CeedBasisApply(basis_x, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_nodes);
+
+  // Set values of u at nodes
+  {
+    const CeedScalar *x_array;
+    CeedScalar        u_array[p];
+
+    CeedVectorGetArrayRead(x_nodes, CEED_MEM_HOST, &x_array);
+    for (CeedInt i = 0; i < p; i++) u_array[i] = Eval(x_array[i], ALEN(c), c);
+    CeedVectorRestoreArrayRead(x_nodes, &x_array);
+    CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)&u_array);
+  }
+
+  // Interpolate to arbitrary points
+  CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, p, q, CEED_GAUSS, &basis_u);
+  {
+    CeedScalar x_array[4] = {-0.33, -0.65, 0.16, 0.99};
+
+    CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+  }
+  CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x_points, u, v);
+
+  for (CeedInt i = 0; i < num_points; i++) {
+    const CeedInt     num_point[1] = {1};
+    CeedScalar        fx           = 0.0;
+    const CeedScalar *x_array, *u_array, *v_array, *u_point_array;
+
+    CeedVectorGetArrayRead(x_points, CEED_MEM_HOST, &x_array);
+    CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array);
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    CeedVectorSetValue(x_point, x_array[i]);
+    CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
+    // Double it
+    CeedBasisApplyAddAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
+    CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array);
+    for (CeedInt j = 0; j < p; j++) fx += u_array[j] * u_point_array[j];
+    if (fabs(v_array[i] * 2.0 - fx) > 100. * CEED_EPSILON) printf("%f != %f = f(%f)\n", v_array[i] * 2.0, fx, x_array[i]);
+    CeedVectorRestoreArrayRead(u_point, &u_point_array);
+    CeedVectorRestoreArrayRead(x_points, &x_array);
+    CeedVectorRestoreArrayRead(u, &u_array);
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  CeedVectorDestroy(&x);
+  CeedVectorDestroy(&x_nodes);
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&x_point);
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&u_point);
+  CeedVectorDestroy(&v_point);
+  CeedBasisDestroy(&basis_x);
+  CeedBasisDestroy(&basis_u);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t365-basis.c b/tests/t365-basis.c
new file mode 100644
index 0000000000..74f93ce881
--- /dev/null
+++ b/tests/t365-basis.c
@@ -0,0 +1,123 @@
+/// @file
+/// Test gradient transpose ApplyAdd in multiple dimensions at arbitrary points
+/// \test Test gradient transpose ApplyAdd in multiple dimensions at arbitrary points
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) {
+  CeedScalar result = tanh(x[0] + 0.1);
+  if (dim > 1) result += atan(x[1] + 0.2);
+  if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3));
+  return result;
+}
+
+static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) {
+  CeedScalar tol;
+  if (scalar_type == CEED_SCALAR_FP32) {
+    if (dim == 3) tol = 0.005;
+    else tol = 1.e-4;
+  } else {
+    tol = 1.e-11;
+  }
+  return tol;
+}
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt dim = 1; dim <= 3; dim++) {
+    CeedVector    x, x_nodes, x_points, u, u_points, v, ones;
+    CeedBasis     basis_x, basis_u;
+    const CeedInt p = 9, q = 9, num_points = 4, x_dim = CeedIntPow(2, dim), p_dim = CeedIntPow(p, dim);
+    CeedScalar    sum_1 = 0, sum_2 = 0;
+
+    CeedVectorCreate(ceed, x_dim * dim, &x);
+    CeedVectorCreate(ceed, p_dim * dim, &x_nodes);
+    CeedVectorCreate(ceed, num_points * dim, &x_points);
+    CeedVectorCreate(ceed, p_dim, &u);
+    CeedVectorCreate(ceed, num_points * dim, &u_points);
+    CeedVectorCreate(ceed, p_dim, &v);
+    CeedVectorCreate(ceed, num_points * dim, &ones);
+
+    CeedVectorSetValue(ones, 1);
+    CeedVectorSetValue(v, 0);
+
+    // Get nodal coordinates
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, p, CEED_GAUSS_LOBATTO, &basis_x);
+    {
+      CeedScalar x_array[x_dim * dim];
+
+      for (CeedInt d = 0; d < dim; d++) {
+        for (CeedInt i = 0; i < x_dim; i++) x_array[d * x_dim + i] = (i % CeedIntPow(2, d + 1)) / CeedIntPow(2, d) ? 1 : -1;
+      }
+      CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    CeedBasisApply(basis_x, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, x, x_nodes);
+
+    // Set values of u at nodes
+    {
+      const CeedScalar *x_array;
+      CeedScalar        u_array[p_dim];
+
+      CeedVectorGetArrayRead(x_nodes, CEED_MEM_HOST, &x_array);
+      for (CeedInt i = 0; i < p_dim; i++) {
+        CeedScalar coord[dim];
+
+        for (CeedInt d = 0; d < dim; d++) coord[d] = x_array[d * p_dim + i];
+        u_array[i] = Eval(dim, coord);
+      }
+      CeedVectorRestoreArrayRead(x_nodes, &x_array);
+      CeedVectorSetArray(u, CEED_MEM_HOST, CEED_COPY_VALUES, (CeedScalar *)&u_array);
+    }
+
+    // Interpolate to arbitrary points
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u);
+    {
+      CeedScalar x_array[12] = {-0.33, -0.65, 0.16, 0.99, -0.65, 0.16, 0.99, -0.33, 0.16, 0.99, -0.33, -0.65};
+
+      CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+
+    // Calculate G u at arbitrary points, G' * 1 at dofs
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, x_points, u, u_points);
+    CeedBasisApplyAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v);
+    // Double it
+    CeedBasisApplyAddAtPoints(basis_u, 1, &num_points, CEED_TRANSPOSE, CEED_EVAL_GRAD, x_points, ones, v);
+    {
+      const CeedScalar *u_array, *v_array, *u_points_array;
+
+      CeedVectorGetArrayRead(u, CEED_MEM_HOST, &u_array);
+      CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+      CeedVectorGetArrayRead(u_points, CEED_MEM_HOST, &u_points_array);
+      for (CeedInt i = 0; i < p_dim; i++) sum_1 += v_array[i] * u_array[i];
+      for (CeedInt i = 0; i < num_points * dim; i++) sum_2 += u_points_array[i];
+      CeedVectorRestoreArrayRead(u, &u_array);
+      CeedVectorRestoreArrayRead(v, &v_array);
+      CeedVectorRestoreArrayRead(u_points, &u_points_array);
+    }
+    {
+      CeedScalarType scalar_type;
+
+      CeedGetScalarType(&scalar_type);
+
+      CeedScalar tol = GetTolerance(scalar_type, dim);
+
+      if (fabs(sum_1 - 2.0 * sum_2) > tol) printf("[%" CeedInt_FMT "] %f != %f\n", dim, sum_1, 2.0 * sum_2);
+    }
+
+    CeedVectorDestroy(&x);
+    CeedVectorDestroy(&x_nodes);
+    CeedVectorDestroy(&x_points);
+    CeedVectorDestroy(&u);
+    CeedVectorDestroy(&u_points);
+    CeedVectorDestroy(&ones);
+    CeedVectorDestroy(&v);
+    CeedBasisDestroy(&basis_x);
+    CeedBasisDestroy(&basis_u);
+  }
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t400-qfunction.h b/tests/t400-qfunction.h
index 1fb64842fd..740c7da030 100644
--- a/tests/t400-qfunction.h
+++ b/tests/t400-qfunction.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *w      = in[0];
diff --git a/tests/t401-qfunction.h b/tests/t401-qfunction.h
index c61cdb8ac6..f91dae701c 100644
--- a/tests/t401-qfunction.h
+++ b/tests/t401-qfunction.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *w      = in[0];
diff --git a/tests/t402-qfunction-f.f90 b/tests/t402-qfunction-f.f90
index e87bcc4f2b..45cac092ce 100644
--- a/tests/t402-qfunction-f.f90
+++ b/tests/t402-qfunction-f.f90
@@ -49,6 +49,11 @@ program test
      & ctxdata,coffset,err)
       call ceedqfunctioncontextview(ctx,err)
 
+      call ceedqfunctionsetnumviewtabs(qf_mass,1,err)
+      call ceedqfunctionview(qf_mass,err)
+      call ceedqfunctioncontextsetnumviewtabs(ctx,1,err)
+      call ceedqfunctioncontextview(ctx,err)
+
       call ceedqfunctiondestroy(qf_setup,err)
       call ceedqfunctiondestroy(qf_mass,err)
       call ceeddestroy(ceed,err)
diff --git a/tests/t402-qfunction.c b/tests/t402-qfunction.c
index 2f80666e15..6f24d492f0 100644
--- a/tests/t402-qfunction.c
+++ b/tests/t402-qfunction.c
@@ -34,6 +34,22 @@ int main(int argc, char **argv) {
   }
   CeedQFunctionContextView(ctx, stdout);
 
+  // Check tabs and CeedObject functionality
+  {
+    CeedQFunction        qf_mass_copy = NULL;
+    CeedQFunctionContext ctx_copy     = NULL;
+
+    CeedQFunctionReferenceCopy(qf_mass, &qf_mass_copy);
+    CeedQFunctionSetNumViewTabs(qf_mass_copy, 1);
+    CeedObjectView((CeedObject)qf_mass_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&qf_mass_copy);
+
+    CeedQFunctionContextReferenceCopy(ctx, &ctx_copy);
+    CeedQFunctionContextSetNumViewTabs(ctx_copy, 1);
+    CeedObjectView((CeedObject)ctx_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&ctx_copy);
+  }
+
   CeedQFunctionDestroy(&qf_setup);
   CeedQFunctionDestroy(&qf_mass);
   CeedQFunctionContextDestroy(&ctx);
diff --git a/tests/t405-qfunction.h b/tests/t405-qfunction.h
index eaf261791f..0c356c943e 100644
--- a/tests/t405-qfunction.h
+++ b/tests/t405-qfunction.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *w      = in[0];
diff --git a/tests/t406-qfunction-helper.h b/tests/t406-qfunction-helper.h
index 9db4901023..0410b00af6 100644
--- a/tests/t406-qfunction-helper.h
+++ b/tests/t406-qfunction-helper.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
@@ -10,11 +10,15 @@
 # pragma  once
 // clang-format on
 
+// Note - ceed/types.h should be used over ceed.h
 #include <ceed.h>
 
 // Test include path with "/./"
 #include "./t406-qfunction-scales.h"
 
-CEED_QFUNCTION_HELPER CeedScalar times_two(CeedScalar x) { return SCALE_TWO * x; }
+// Test include via -I....
+#include <fake-sys-include.h>
 
-CEED_QFUNCTION_HELPER CeedScalar times_three(CeedScalar x) { return SCALE_THREE * x; }
+CEED_QFUNCTION_HELPER CeedScalar times_two(CeedScalar x) { return FAKE_SYS_SCALE_ONE * SCALE_TWO * x; }
+
+CEED_QFUNCTION_HELPER CeedScalar times_three(CeedScalar x) { return FAKE_SYS_SCALE_ONE * SCALE_THREE * x; }
diff --git a/tests/t406-qfunction-scales.h b/tests/t406-qfunction-scales.h
index cde93275ff..7dc42e93c6 100644
--- a/tests/t406-qfunction-scales.h
+++ b/tests/t406-qfunction-scales.h
@@ -3,7 +3,7 @@
 // Testing # on first line
 // Note: #ifndef and #pragma once header guards both work
 
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
diff --git a/tests/t406-qfunction.c b/tests/t406-qfunction.c
index 201c3782a4..d593f8f73d 100644
--- a/tests/t406-qfunction.c
+++ b/tests/t406-qfunction.c
@@ -18,6 +18,14 @@ int main(int argc, char **argv) {
   CeedScalar    v_true[q];
 
   CeedInit(argv[1], &ceed);
+  {
+    char  file_path[2056] = __FILE__;
+    char *last_slash      = strrchr(file_path, '/');
+
+    memcpy(&file_path[last_slash - file_path], "/test-include/", 15);
+    CeedAddJitSourceRoot(ceed, file_path);
+    CeedAddJitDefine(ceed, "COMPILER_DEFINED_SCALE=42");
+  }
 
   CeedVectorCreate(ceed, q, &w);
   CeedVectorCreate(ceed, q, &u);
@@ -64,9 +72,9 @@ int main(int argc, char **argv) {
 
     CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
     for (CeedInt i = 0; i < q; i++) {
-      if (fabs(5 * v_true[i] * sqrt(2.) - v_array[i]) > 1E3 * CEED_EPSILON) {
+      if (fabs(5 * COMPILER_DEFINED_SCALE * v_true[i] * sqrt(2.) - v_array[i]) > 5E3 * CEED_EPSILON) {
         // LCOV_EXCL_START
-        printf("[%" CeedInt_FMT "] v_true %f != v %f\n", i, 5 * v_true[i] * sqrt(2.), v_array[i]);
+        printf("[%" CeedInt_FMT "] v_true %f != v %f\n", i, 5 * COMPILER_DEFINED_SCALE * v_true[i] * sqrt(2.), v_array[i]);
         // LCOV_EXCL_STOP
       }
     }
diff --git a/tests/t406-qfunction.h b/tests/t406-qfunction.h
index f4782f7029..617e6b7875 100644
--- a/tests/t406-qfunction.h
+++ b/tests/t406-qfunction.h
@@ -1,24 +1,33 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-// Note: intentionally testing strange spacing in '#include's
+// Note: intentionally testing strange spacing in include's
 // clang-format off
+// Note - ceed/types.h should be used over ceed.h
 #include <ceed.h>
-#  include  <math.h>
+// Note - system headers like math.h and std*.h should be guarded
+#ifndef CEED_RUNNING_JIT_PASS
+#  include <math.h>
+#endif
 
 #include "t406-qfunction-helper.h"
 // Test duplicate includes of guarded files
 // Also test include path with "/../"
 #include "../tests/t406-qfunction-helper.h"
 // Also test include path with "/../../"
-#include "../../libCEED/tests/t406-qfunction-helper.h"
+#include "output/../../tests/t406-qfunction-helper.h"
 #  include "t406-qfunction-scales.h"
 // clang-format on
 
+// Extra define set via CeedAddJitDefine() during JiT
+#ifndef CEED_RUNNING_JIT_PASS
+#define COMPILER_DEFINED_SCALE 42
+#endif
+
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *w      = in[0];
   CeedScalar       *q_data = out[0];
@@ -32,7 +41,7 @@ CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, Ce
   const CeedScalar *q_data = in[0], *u = in[1];
   CeedScalar       *v = out[0];
   for (CeedInt i = 0; i < Q; i++) {
-    v[i] = q_data[i] * (times_two(u[i]) + times_three(u[i])) * sqrt(1.0 * SCALE_TWO);
+    v[i] = q_data[i] * COMPILER_DEFINED_SCALE * (times_two(u[i]) + times_three(u[i])) * sqrt(1.0 * SCALE_TWO);
   }
   return 0;
 }
diff --git a/tests/t409-qfunction.c b/tests/t409-qfunction.c
index 44029cc797..5f17a5614a 100644
--- a/tests/t409-qfunction.c
+++ b/tests/t409-qfunction.c
@@ -74,6 +74,7 @@ int main(int argc, char **argv) {
   CeedQFunctionContextRestoreData(ctx, &ctx_data_new);
   is_writable = false;
   CeedQFunctionSetContextWritable(qf, is_writable);
+
   {
     in[0]  = u;
     out[0] = v;
diff --git a/tests/t409-qfunction.h b/tests/t409-qfunction.h
index 27e2c6585e..b2f59a9f80 100644
--- a/tests/t409-qfunction.h
+++ b/tests/t409-qfunction.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   CeedScalar       *scale = (CeedScalar *)ctx;
diff --git a/tests/t413-qfunction-f.f90 b/tests/t413-qfunction-f.f90
index c6a43d7361..754d881a4d 100644
--- a/tests/t413-qfunction-f.f90
+++ b/tests/t413-qfunction-f.f90
@@ -15,6 +15,8 @@ program test
 
       call ceedqfunctionview(qf_setup,err)
       call ceedqfunctionview(qf_mass,err)
+      call ceedqfunctionsetnumviewtabs(qf_mass,1,err)
+      call ceedqfunctionview(qf_mass,err)
 
       call ceedqfunctiondestroy(qf_setup,err)
       call ceedqfunctiondestroy(qf_mass,err)
diff --git a/tests/t413-qfunction.c b/tests/t413-qfunction.c
index 690502ae76..aeecdd639f 100644
--- a/tests/t413-qfunction.c
+++ b/tests/t413-qfunction.c
@@ -14,6 +14,8 @@ int main(int argc, char **argv) {
 
   CeedQFunctionView(qf_setup, stdout);
   CeedQFunctionView(qf_mass, stdout);
+  CeedQFunctionSetNumViewTabs(qf_mass, 1);
+  CeedQFunctionView(qf_mass, stdout);
 
   CeedQFunctionDestroy(&qf_setup);
   CeedQFunctionDestroy(&qf_mass);
diff --git a/tests/t500-operator.h b/tests/t500-operator.h
index de9ca8966a..935d077208 100644
--- a/tests/t500-operator.h
+++ b/tests/t500-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *dxdX = in[1];
diff --git a/tests/t502-operator.h b/tests/t502-operator.h
index 9d343b5ab9..fab809d8db 100644
--- a/tests/t502-operator.h
+++ b/tests/t502-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *dxdX = in[1];
diff --git a/tests/t504-operator-f.f90 b/tests/t504-operator-f.f90
index ccc87614bc..beedfda264 100644
--- a/tests/t504-operator-f.f90
+++ b/tests/t504-operator-f.f90
@@ -92,7 +92,10 @@ program test
       call ceedoperatorsetfield(op_mass,'v',erestrictu,bu,&
      & ceed_vector_active,err)
 
+      call ceedoperatorsetname(op_setup,'setup',err)
       call ceedoperatorview(op_setup,err)
+      call ceedoperatorsetname(op_mass,'mass',err)
+      call ceedoperatorsetnumviewtabs(op_mass,1,err)
       call ceedoperatorview(op_mass,err)
 
       call ceedvectordestroy(qdata,err)
diff --git a/tests/t504-operator.c b/tests/t504-operator.c
index ce5e7bb0c2..41dfcc7962 100644
--- a/tests/t504-operator.c
+++ b/tests/t504-operator.c
@@ -66,8 +66,21 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
 
+  CeedOperatorSetName(op_setup, "setup");
+  CeedOperatorViewTerse(op_setup, stdout);
   CeedOperatorView(op_setup, stdout);
-  CeedOperatorView(op_mass, stdout);
+
+  // Check tabs and CeedObject functionality
+  {
+    CeedOperator op_mass_copy = NULL;
+
+    CeedOperatorReferenceCopy(op_mass, &op_mass_copy);
+    CeedOperatorSetName(op_mass_copy, "mass");
+    CeedOperatorSetNumViewTabs(op_mass_copy, 1);
+    CeedOperatorViewTerse(op_mass_copy, stdout);
+    CeedObjectView((CeedObject)op_mass_copy, stdout);
+    CeedObjectDestroy((CeedObject *)&op_mass_copy);
+  }
 
   CeedVectorDestroy(&q_data);
   CeedElemRestrictionDestroy(&elem_restriction_u);
diff --git a/tests/t507-operator.h b/tests/t507-operator.h
index 5d245534be..312500b35f 100644
--- a/tests/t507-operator.h
+++ b/tests/t507-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *dxdX = in[1];
diff --git a/tests/t510-operator.h b/tests/t510-operator.h
index 01cf47450c..171f9d01df 100644
--- a/tests/t510-operator.h
+++ b/tests/t510-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *J = in[1];
diff --git a/tests/t520-operator-f.f90 b/tests/t520-operator-f.f90
index 628ce3735c..3882ddc1e0 100644
--- a/tests/t520-operator-f.f90
+++ b/tests/t520-operator-f.f90
@@ -211,13 +211,13 @@ program test
      & buhex,ceed_vector_active,err)
 
 ! Composite Operators
-      call ceedcompositeoperatorcreate(ceed,op_setup,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err)
+      call ceedoperatorcreatecomposite(ceed,op_setup,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err)
 
-      call ceedcompositeoperatorcreate(ceed,op_mass,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masstet,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masshex,err)
+      call ceedoperatorcreatecomposite(ceed,op_mass,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masstet,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masshex,err)
 
 ! Apply Setup Operator
       call ceedoperatorapply(op_setup,x,ceed_vector_none,&
diff --git a/tests/t520-operator.c b/tests/t520-operator.c
index 9035234ffa..5632b3e2d6 100644
--- a/tests/t520-operator.c
+++ b/tests/t520-operator.c
@@ -111,6 +111,7 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_mass_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet);
   CeedOperatorSetField(op_mass_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_mass_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetName(op_mass_tet, "mass tet");
 
   // Set up Hex Elements
   // -- Restrictions
@@ -154,19 +155,30 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_mass_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex);
   CeedOperatorSetField(op_mass_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetName(op_mass_hex, "mass hex");
 
   // Set up Composite Operators
   // -- Create
-  CeedCompositeOperatorCreate(ceed, &op_setup);
+  CeedOperatorCreateComposite(ceed, &op_setup);
   // -- Add SubOperators
-  CeedCompositeOperatorAddSub(op_setup, op_setup_tet);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_hex);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_tet);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_hex);
 
   // -- Create
-  CeedCompositeOperatorCreate(ceed, &op_mass);
+  CeedOperatorCreateComposite(ceed, &op_mass);
   // -- Add SubOperators
-  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_tet);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_hex);
+
+  {  // Test CeedOperatorCompositeGetSubByName
+    CeedOperator op_byname;
+
+    CeedOperatorCompositeGetSubByName(op_mass, "mass hex", &op_byname);
+    if (op_byname != op_mass_hex) printf("CeedOperatorCompositeGetSubByName returned incorrect Sub Operator");
+
+    CeedOperatorCompositeGetSubByName(op_mass, "asdf", &op_byname);
+    if (op_byname != NULL) printf("CeedOperatorCompositeGetSubByName returned non-NULL for non-existent Sub Operator");
+  }
 
   // Apply Setup Operator
   CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE);
diff --git a/tests/t521-operator-f.f90 b/tests/t521-operator-f.f90
index 20ab09eb7b..73fff92d7a 100644
--- a/tests/t521-operator-f.f90
+++ b/tests/t521-operator-f.f90
@@ -213,13 +213,13 @@ program test
      & buhex,ceed_vector_active,err)
 
 ! Composite Operators
-      call ceedcompositeoperatorcreate(ceed,op_setup,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err)
+      call ceedoperatorcreatecomposite(ceed,op_setup,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err)
 
-      call ceedcompositeoperatorcreate(ceed,op_mass,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masstet,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masshex,err)
+      call ceedoperatorcreatecomposite(ceed,op_mass,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masstet,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masshex,err)
 
 ! Apply Setup Operator
       call ceedoperatorapply(op_setup,x,ceed_vector_none,&
diff --git a/tests/t521-operator.c b/tests/t521-operator.c
index 1fff943186..dd13ea5589 100644
--- a/tests/t521-operator.c
+++ b/tests/t521-operator.c
@@ -156,13 +156,13 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE);
 
   // Composite Operators
-  CeedCompositeOperatorCreate(ceed, &op_setup);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_tet);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_hex);
+  CeedOperatorCreateComposite(ceed, &op_setup);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_tet);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_hex);
 
-  CeedCompositeOperatorCreate(ceed, &op_mass);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
+  CeedOperatorCreateComposite(ceed, &op_mass);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_tet);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_hex);
 
   // Apply Setup Operator
   CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE);
diff --git a/tests/t522-operator-f.f90 b/tests/t522-operator-f.f90
index 4ea3773f7b..98b9089edb 100644
--- a/tests/t522-operator-f.f90
+++ b/tests/t522-operator-f.f90
@@ -215,13 +215,13 @@ program test
      & buhex,ceed_vector_active,err)
 
 ! Composite Operators
-      call ceedcompositeoperatorcreate(ceed,op_setup,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err)
+      call ceedoperatorcreatecomposite(ceed,op_setup,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err)
 
-      call ceedcompositeoperatorcreate(ceed,op_diff,err)
-      call ceedcompositeoperatoraddsub(op_diff,op_difftet,err)
-      call ceedcompositeoperatoraddsub(op_diff,op_diffhex,err)
+      call ceedoperatorcreatecomposite(ceed,op_diff,err)
+      call ceedoperatorcompositeaddsub(op_diff,op_difftet,err)
+      call ceedoperatorcompositeaddsub(op_diff,op_diffhex,err)
 
 ! Apply Setup Operator
       call ceedoperatorapply(op_setup,x,ceed_vector_none,&
diff --git a/tests/t522-operator.c b/tests/t522-operator.c
index b2e1da90ac..8572c0d687 100644
--- a/tests/t522-operator.c
+++ b/tests/t522-operator.c
@@ -159,13 +159,13 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_diff_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE);
 
   // Composite Operators
-  CeedCompositeOperatorCreate(ceed, &op_setup);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_tet);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_hex);
+  CeedOperatorCreateComposite(ceed, &op_setup);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_tet);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_hex);
 
-  CeedCompositeOperatorCreate(ceed, &op_diff);
-  CeedCompositeOperatorAddSub(op_diff, op_diff_tet);
-  CeedCompositeOperatorAddSub(op_diff, op_diff_hex);
+  CeedOperatorCreateComposite(ceed, &op_diff);
+  CeedOperatorCompositeAddSub(op_diff, op_diff_tet);
+  CeedOperatorCompositeAddSub(op_diff, op_diff_hex);
 
   // Apply Setup Operator
   CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE);
diff --git a/tests/t522-operator.h b/tests/t522-operator.h
index 3f70b7d354..b594818bd1 100644
--- a/tests/t522-operator.h
+++ b/tests/t522-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *qw = in[0], *J = in[1];
diff --git a/tests/t523-operator-f.f90 b/tests/t523-operator-f.f90
index 0431b60ee7..fcd4504fe5 100644
--- a/tests/t523-operator-f.f90
+++ b/tests/t523-operator-f.f90
@@ -205,15 +205,16 @@ program test
      & buhex,ceed_vector_active,err)
 
 ! Composite Operators
-      call ceedcompositeoperatorcreate(ceed,op_setup,err)
+      call ceedoperatorcreatecomposite(ceed,op_setup,err)
       call ceedoperatorsetname(op_setup,'setup',err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err)
 
-      call ceedcompositeoperatorcreate(ceed,op_mass,err)
+      call ceedoperatorcreatecomposite(ceed,op_mass,err)
       call ceedoperatorsetname(op_mass,'mass',err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masstet,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masshex,err)
+      call ceedoperatorsetnumviewtabs(op_mass,1,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masstet,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masshex,err)
 
 ! View
       call ceedoperatorview(op_setup,err)
diff --git a/tests/t523-operator.c b/tests/t523-operator.c
index b882379516..a1e2307839 100644
--- a/tests/t523-operator.c
+++ b/tests/t523-operator.c
@@ -150,21 +150,24 @@ int main(int argc, char **argv) {
 
   // Set up Composite Operators
   // -- Create
-  CeedCompositeOperatorCreate(ceed, &op_setup);
+  CeedOperatorCreateComposite(ceed, &op_setup);
   CeedOperatorSetName(op_setup, "setup");
   // -- Add SubOperators
-  CeedCompositeOperatorAddSub(op_setup, op_setup_tet);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_hex);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_tet);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_hex);
 
   // -- Create
-  CeedCompositeOperatorCreate(ceed, &op_mass);
+  CeedOperatorCreateComposite(ceed, &op_mass);
   CeedOperatorSetName(op_mass, "mass");
   // -- Add SubOperators
-  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_tet);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_hex);
 
   // View
+  CeedOperatorViewTerse(op_setup, stdout);
   CeedOperatorView(op_setup, stdout);
+  CeedOperatorSetNumViewTabs(op_mass, 1);
+  CeedOperatorViewTerse(op_mass, stdout);
   CeedOperatorView(op_mass, stdout);
 
   // Cleanup
diff --git a/tests/t524-operator-f.f90 b/tests/t524-operator-f.f90
index 4639442a5c..16b041c09a 100644
--- a/tests/t524-operator-f.f90
+++ b/tests/t524-operator-f.f90
@@ -215,13 +215,13 @@ program test
      & buhex,ceed_vector_active,err)
 
 ! Composite Operators
-      call ceedcompositeoperatorcreate(ceed,op_setup,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuptet,err)
-      call ceedcompositeoperatoraddsub(op_setup,op_setuphex,err)
+      call ceedoperatorcreatecomposite(ceed,op_setup,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuptet,err)
+      call ceedoperatorcompositeaddsub(op_setup,op_setuphex,err)
 
-      call ceedcompositeoperatorcreate(ceed,op_mass,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masstet,err)
-      call ceedcompositeoperatoraddsub(op_mass,op_masshex,err)
+      call ceedoperatorcreatecomposite(ceed,op_mass,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masstet,err)
+      call ceedoperatorcompositeaddsub(op_mass,op_masshex,err)
 
 ! Apply Setup Operator
       call ceedoperatorapply(op_setup,x,ceed_vector_none,&
diff --git a/tests/t524-operator.c b/tests/t524-operator.c
index fec0fe6ccd..3d61a563b3 100644
--- a/tests/t524-operator.c
+++ b/tests/t524-operator.c
@@ -155,13 +155,13 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE);
 
   // Composite Operators
-  CeedCompositeOperatorCreate(ceed, &op_setup);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_tet);
-  CeedCompositeOperatorAddSub(op_setup, op_setup_hex);
+  CeedOperatorCreateComposite(ceed, &op_setup);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_tet);
+  CeedOperatorCompositeAddSub(op_setup, op_setup_hex);
 
-  CeedCompositeOperatorCreate(ceed, &op_mass);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
+  CeedOperatorCreateComposite(ceed, &op_mass);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_tet);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_hex);
 
   // Apply Setup Operator
   CeedOperatorApply(op_setup, x, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE);
diff --git a/tests/t525-operator.c b/tests/t525-operator.c
index 9d0d80442f..bed1365a77 100644
--- a/tests/t525-operator.c
+++ b/tests/t525-operator.c
@@ -73,9 +73,9 @@ int main(int argc, char **argv) {
   CeedOperatorCreate(ceed, qf_sub_2, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_sub_2);
 
   // Composite operator
-  CeedCompositeOperatorCreate(ceed, &op_composite);
-  CeedCompositeOperatorAddSub(op_composite, op_sub_1);
-  CeedCompositeOperatorAddSub(op_composite, op_sub_2);
+  CeedOperatorCreateComposite(ceed, &op_composite);
+  CeedOperatorCompositeAddSub(op_composite, op_sub_1);
+  CeedOperatorCompositeAddSub(op_composite, op_sub_2);
 
   // Check setting field in context of single sub-operator for composite operator
   CeedOperatorGetContextFieldLabel(op_composite, "time", &time_label);
@@ -112,6 +112,7 @@ int main(int argc, char **argv) {
 
     CeedOperatorGetContext(op_sub_1, &ctx_copy);
     if (ctx_copy != qf_ctx_sub_1) printf("Incorrect QFunctionContext retrieved");
+    CeedQFunctionContextDestroy(&ctx_copy);
 
     CeedOperatorGetContext(op_sub_2, &ctx_copy);  // Destroys reference to qf_ctx_sub_1
     if (ctx_copy != qf_ctx_sub_2) printf("Incorrect QFunctionContext retrieved");
diff --git a/tests/t526-operator.c b/tests/t526-operator.c
index 6d66590d15..8e68ab89b3 100644
--- a/tests/t526-operator.c
+++ b/tests/t526-operator.c
@@ -114,10 +114,10 @@ int main(int argc, char **argv) {
 
   // Set up Composite Operator
   // -- Create
-  CeedCompositeOperatorCreate(ceed, &op_mass);
+  CeedOperatorCreateComposite(ceed, &op_mass);
   // -- Add SubOperators
-  CeedCompositeOperatorAddSub(op_mass, op_mass_tet);
-  CeedCompositeOperatorAddSub(op_mass, op_mass_hex);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_tet);
+  CeedOperatorCompositeAddSub(op_mass, op_mass_hex);
 
   // Estimate FLOPs
   CeedQFunctionSetUserFlopsEstimate(qf_mass, 1);
diff --git a/tests/t530-operator.c b/tests/t530-operator.c
index d9d18083b0..60716e4544 100644
--- a/tests/t530-operator.c
+++ b/tests/t530-operator.c
@@ -94,12 +94,13 @@ int main(int argc, char **argv) {
 
     CeedVectorGetArrayRead(qf_assembled, CEED_MEM_HOST, &assembled_array);
     CeedVectorGetArrayRead(q_data, CEED_MEM_HOST, &q_data_array);
-    for (CeedInt i = 0; i < num_qpts; i++)
+    for (CeedInt i = 0; i < num_qpts; i++) {
       if (fabs(q_data_array[i] - assembled_array[i]) > 1e-9) {
         // LCOV_EXCL_START
         printf("Error: qf_assembled[%" CeedInt_FMT "] = %f != %f\n", i, assembled_array[i], q_data_array[i]);
         // LCOV_EXCL_STOP
       }
+    }
     CeedVectorRestoreArrayRead(qf_assembled, &assembled_array);
     CeedVectorRestoreArrayRead(q_data, &q_data_array);
   }
diff --git a/tests/t530-operator.h b/tests/t530-operator.h
index 01cf47450c..171f9d01df 100644
--- a/tests/t530-operator.h
+++ b/tests/t530-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *J = in[1];
diff --git a/tests/t531-operator-f.f90 b/tests/t531-operator-f.f90
index d78dd92351..a2deb4434d 100644
--- a/tests/t531-operator-f.f90
+++ b/tests/t531-operator-f.f90
@@ -15,7 +15,7 @@ program test
       integer bx,bu
       integer qf_setup,qf_diff,qf_diff_lin
       integer op_setup,op_diff,op_diff_lin
-      integer qdata,x,a,u,v
+      integer qdata,x,a,u,v,v_lin
       integer nelem,p,q,d
       integer row,col,offset
       parameter(nelem=6)
@@ -28,8 +28,8 @@ program test
       parameter(ndofs=(nx*2+1)*(ny*2+1))
       parameter(nqpts=nelem*q*q)
       integer indx(nelem*p*p)
-      real*8 arrx(d*ndofs),vv(ndofs)
-      integer*8 xoffset,voffset
+      real*8 arrx(d*ndofs),uu(ndofs),vv(ndofs),vvlin(ndofs)
+      integer*8 xoffset,uoffset,voffset,vlinoffset
 
       character arg*32
 
@@ -42,14 +42,26 @@ program test
 ! DoF Coordinates
       do i=0,nx*2
         do j=0,ny*2
-          arrx(i+j*(nx*2+1)+0*ndofs+1)=1.d0*i/(2*nx)
-          arrx(i+j*(nx*2+1)+1*ndofs+1)=1.d0*j/(2*ny)
+          arrx(i+j*(nx*2+1)+0*ndofs+1)=1.d0*i/(2*nx)+j*0.5
+          arrx(i+j*(nx*2+1)+1*ndofs+1)=1.d0*j/(2*ny)+i*0.5
         enddo
       enddo
       call ceedvectorcreate(ceed,d*ndofs,x,err)
       xoffset=0
       call ceedvectorsetarray(x,ceed_mem_host,ceed_use_pointer,arrx,xoffset,err)
 
+! Input, output arrays
+      do i=0,nx*2
+        do j=0,ny*2
+          uu(i+j*(nx*2+1)+1)=i*nx+j*ny
+        enddo
+      enddo
+      call ceedvectorcreate(ceed,ndofs,u,err)
+      uoffset=0
+      call ceedvectorsetarray(u,ceed_mem_host,ceed_use_pointer,uu,uoffset,err)
+      call ceedvectorcreate(ceed,ndofs,v,err)
+      call ceedvectorcreate(ceed,ndofs,v_lin,err)
+
 ! Qdata Vector
       call ceedvectorcreate(ceed,nqpts*d*(d+1)/2,qdata,err)
 
@@ -125,23 +137,8 @@ program test
      & bu,ceed_vector_active,err)
 
 ! Apply original Poisson Operator
-      call ceedvectorcreate(ceed,ndofs,u,err)
-      call ceedvectorsetvalue(u,1.d0,err)
-      call ceedvectorcreate(ceed,ndofs,v,err)
-      call ceedvectorsetvalue(v,0.d0,err)
       call ceedoperatorapply(op_diff,u,v,ceed_request_immediate,err)
 
-! Check Output
-      call ceedvectorgetarrayread(v,ceed_mem_host,vv,voffset,err)
-      do i=1,ndofs
-      if (abs(vv(voffset+i))>1.0d-14) then
-! LCOV_EXCL_START
-        write(*,*) 'Error: Operator computed v[i] = ',vv(voffset+i),' != 0.0'
-! LCOV_EXCL_STOP
-      endif
-      enddo
-      call ceedvectorrestorearrayread(v,vv,voffset,err)
-
 ! Assemble QFunction
       call ceedoperatorlinearassembleqfunction(op_diff,a,erestrictlini,&
      & ceed_request_immediate,err)
@@ -165,20 +162,21 @@ program test
      & bu,ceed_vector_active,err)
 
 ! Apply linearized Poisson Operator
-      call ceedvectorsetvalue(v,0.d0,err)
-      call ceedoperatorapply(op_diff_lin,u,v,ceed_request_immediate,err)
+      call ceedoperatorapply(op_diff_lin,u,v_lin,ceed_request_immediate,err)
 
 ! Check Output
       call ceedvectorgetarrayread(v,ceed_mem_host,vv,voffset,err)
+      call ceedvectorgetarrayread(v_lin,ceed_mem_host,vvlin,vlinoffset,err)
       do i=1,ndofs
-      if (abs(vv(voffset+i))>1.0d-14) then
+      if (abs(vv(voffset+i)-vvlin(vlinoffset+i))>1.0d-14) then
 ! LCOV_EXCL_START
         write(*,*) 'Error: Linearized operator computed v[i] = ',vv(voffset+i),&
-     &   ' != 0.0'
+     &   ' != ',vvlin(vlinoffset+i)
 ! LCOV_EXCL_STOP
       endif
       enddo
       call ceedvectorrestorearrayread(v,vv,voffset,err)
+      call ceedvectorrestorearrayread(v_lin,vvlin,vlinoffset,err)
 
 ! Cleanup
       call ceedqfunctiondestroy(qf_setup,err)
@@ -198,6 +196,7 @@ program test
       call ceedvectordestroy(a,err)
       call ceedvectordestroy(u,err)
       call ceedvectordestroy(v,err)
+      call ceedvectordestroy(v_lin,err)
       call ceedvectordestroy(qdata,err)
       call ceeddestroy(ceed,err)
       end
diff --git a/tests/t531-operator-f.h b/tests/t531-operator-f.h
index 20f02ea332..590140632d 100644
--- a/tests/t531-operator-f.h
+++ b/tests/t531-operator-f.h
@@ -11,8 +11,8 @@
       do i=1,q
         w=u2(i)/(u1(i+q*0)*u1(i+q*3)-u1(i+q*1)*u1(i+q*2))
         v1(i+q*0)=w*(u1(i+q*2)*u1(i+q*2)+u1(i+q*3)*u1(i+q*3))
-        v1(i+q*1)=-w*(u1(i+q*0)*u1(i+q*2)+u1(i+q*2)*u1(i+q*3))
-        v1(i+q*2)=w*(u1(i+q*0)*u1(i+q*0)+u1(i+q*1)*u1(i+q*1))
+        v1(i+q*1)=w*(u1(i+q*0)*u1(i+q*0)+u1(i+q*1)*u1(i+q*1))
+        v1(i+q*2)=-w*(u1(i+q*0)*u1(i+q*2)+u1(i+q*2)*u1(i+q*3))
       enddo
 
       ierr=0
diff --git a/tests/t531-operator.c b/tests/t531-operator.c
index 9462d49323..b0e09caf8b 100644
--- a/tests/t531-operator.c
+++ b/tests/t531-operator.c
@@ -14,7 +14,7 @@ int main(int argc, char **argv) {
   CeedBasis           basis_x, basis_u;
   CeedQFunction       qf_setup, qf_diff, qf_diff_assembled;
   CeedOperator        op_setup, op_diff, op_diff_assembled;
-  CeedVector          q_data, x, assembled = NULL, u, v;
+  CeedVector          q_data, x, assembled = NULL, u, v, v_assembled;
   CeedInt             num_elem = 6, p = 3, q = 4, dim = 2;
   CeedInt             nx = 3, ny = 2;
   CeedInt             num_dofs = (nx * 2 + 1) * (ny * 2 + 1), num_qpts = num_elem * q * q;
@@ -29,14 +29,26 @@ int main(int argc, char **argv) {
 
     for (CeedInt i = 0; i < nx * 2 + 1; i++) {
       for (CeedInt j = 0; j < ny * 2 + 1; j++) {
-        x_array[i + j * (nx * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * nx);
-        x_array[i + j * (nx * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * ny);
+        x_array[i + j * (nx * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * nx) + 0.5 * j;
+        x_array[i + j * (nx * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * ny) + 0.5 * i;
       }
     }
     CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
   }
   CeedVectorCreate(ceed, num_dofs, &u);
+  {
+    CeedScalar *u_array;
+
+    CeedVectorGetArrayWrite(u, CEED_MEM_HOST, &u_array);
+    for (CeedInt i = 0; i < nx * 2 + 1; i++) {
+      for (CeedInt j = 0; j < ny * 2 + 1; j++) {
+        u_array[i + j * (nx * 2 + 1)] = i * nx + j * ny;
+      }
+    }
+    CeedVectorRestoreArray(u, &u_array);
+  }
   CeedVectorCreate(ceed, num_dofs, &v);
+  CeedVectorCreate(ceed, num_dofs, &v_assembled);
   CeedVectorCreate(ceed, num_qpts * dim * (dim + 1) / 2, &q_data);
 
   // Restrictions
@@ -88,20 +100,8 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
 
   // Apply original Poisson Operator
-  CeedVectorSetValue(u, 1.0);
   CeedOperatorApply(op_diff, u, v, CEED_REQUEST_IMMEDIATE);
 
-  // Check output
-  {
-    const CeedScalar *v_array;
-
-    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
-    for (CeedInt i = 0; i < num_dofs; i++) {
-      if (fabs(v_array[i]) > 100. * CEED_EPSILON) printf("Error: Operator computed v[%" CeedInt_FMT "] = %f != 0.0\n", i, v_array[i]);
-    }
-    CeedVectorRestoreArrayRead(v, &v_array);
-  }
-
   // Assemble QFunction
   CeedOperatorSetQFunctionAssemblyReuse(op_diff, true);
   CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_diff, &assembled, &elem_restriction_assembled, CEED_REQUEST_IMMEDIATE);
@@ -122,18 +122,23 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_diff_assembled, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
 
   // Apply new Poisson Operator
-  CeedVectorSetValue(v, 0.0);
-  CeedOperatorApply(op_diff_assembled, u, v, CEED_REQUEST_IMMEDIATE);
+  CeedOperatorApply(op_diff_assembled, u, v_assembled, CEED_REQUEST_IMMEDIATE);
 
   // Check output
   {
-    const CeedScalar *v_array;
+    const CeedScalar *v_array, *v_assembled_array;
 
     CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    CeedVectorGetArrayRead(v_assembled, CEED_MEM_HOST, &v_assembled_array);
     for (CeedInt i = 0; i < num_dofs; i++) {
-      if (fabs(v_array[i]) > 100. * CEED_EPSILON) printf("Error: Linearized operator computed v[i] = %f != 0.0\n", v_array[i]);
+      if (fabs(v_array[i] - v_assembled_array[i]) > 100. * CEED_EPSILON) {
+        // LCOV_EXCL_START
+        printf("Error: Linearized operator computed v[%d] = %f != %f\n", i, v_assembled_array[i], v_array[i]);
+        // LCOV_EXCL_STOP
+      }
     }
     CeedVectorRestoreArrayRead(v, &v_array);
+    CeedVectorRestoreArrayRead(v_assembled, &v_assembled_array);
   }
 
   // Cleanup
@@ -142,6 +147,7 @@ int main(int argc, char **argv) {
   CeedVectorDestroy(&q_data);
   CeedVectorDestroy(&u);
   CeedVectorDestroy(&v);
+  CeedVectorDestroy(&v_assembled);
   CeedElemRestrictionDestroy(&elem_restriction_u);
   CeedElemRestrictionDestroy(&elem_restriction_x);
   CeedElemRestrictionDestroy(&elem_restriction_q_data);
diff --git a/tests/t531-operator.h b/tests/t531-operator.h
index a9f69f6bd5..f1c3ccab25 100644
--- a/tests/t531-operator.h
+++ b/tests/t531-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store
@@ -28,8 +28,8 @@ CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, C
     const CeedScalar J22 = J[i + Q * 3];
     const CeedScalar w   = qw[i] / (J11 * J22 - J21 * J12);
     qd[i + Q * 0]        = w * (J12 * J12 + J22 * J22);
-    qd[i + Q * 2]        = w * (J11 * J11 + J21 * J21);
-    qd[i + Q * 1]        = -w * (J11 * J12 + J21 * J22);
+    qd[i + Q * 1]        = w * (J11 * J11 + J21 * J21);
+    qd[i + Q * 2]        = -w * (J11 * J12 + J21 * J22);
   }
 
   return 0;
@@ -50,7 +50,6 @@ CEED_QFUNCTION(diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, Ce
     dv[i + Q * 0]        = qd[i + Q * 0] * du0 + qd[i + Q * 2] * du1;
     dv[i + Q * 1]        = qd[i + Q * 2] * du0 + qd[i + Q * 1] * du1;
   }
-
   return 0;
 }
 
diff --git a/tests/t532-operator.h b/tests/t532-operator.h
index e15e3aed19..b81f87dbc6 100644
--- a/tests/t532-operator.h
+++ b/tests/t532-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *J = in[0], *weight = in[1];
diff --git a/tests/t533-operator.c b/tests/t533-operator.c
index a01dabda4a..4ac1c523de 100644
--- a/tests/t533-operator.c
+++ b/tests/t533-operator.c
@@ -28,11 +28,12 @@ int main(int argc, char **argv) {
   {
     CeedScalar x_array[dim * num_dofs];
 
-    for (CeedInt i = 0; i < nx * 2 + 1; i++)
+    for (CeedInt i = 0; i < nx * 2 + 1; i++) {
       for (CeedInt j = 0; j < ny * 2 + 1; j++) {
         x_array[i + j * (nx * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * nx);
         x_array[i + j * (nx * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * ny);
       }
+    }
     CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
   }
   CeedVectorCreate(ceed, num_dofs, &u);
@@ -45,8 +46,9 @@ int main(int argc, char **argv) {
     col    = i % nx;
     row    = i / nx;
     offset = col * (p - 1) + row * (nx * 2 + 1) * (p - 1);
-    for (CeedInt j = 0; j < p; j++)
+    for (CeedInt j = 0; j < p; j++) {
       for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (nx * 2 + 1) + j;
+    }
   }
   CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x);
   CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_u);
@@ -89,7 +91,7 @@ int main(int argc, char **argv) {
 
   // Manually assemble diagonal
   CeedVectorSetValue(u, 0.0);
-  for (int i = 0; i < num_dofs; i++) {
+  for (CeedInt i = 0; i < num_dofs; i++) {
     CeedScalar       *u_array;
     const CeedScalar *v_array;
 
@@ -113,7 +115,7 @@ int main(int argc, char **argv) {
     const CeedScalar *assembled_array;
 
     CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array);
-    for (int i = 0; i < num_dofs; i++) {
+    for (CeedInt i = 0; i < num_dofs; i++) {
       if (fabs(assembled_array[i] - assembled_true[i]) > 100. * CEED_EPSILON) {
         // LCOV_EXCL_START
         printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, assembled_array[i], assembled_true[i]);
diff --git a/tests/t534-operator.h b/tests/t534-operator.h
index 3fc4c58887..518481a070 100644
--- a/tests/t534-operator.h
+++ b/tests/t534-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store
diff --git a/tests/t535-operator.h b/tests/t535-operator.h
index 7f6797608c..fc62a6ca0d 100644
--- a/tests/t535-operator.h
+++ b/tests/t535-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *J = in[0], *weight = in[1];
diff --git a/tests/t537-operator.h b/tests/t537-operator.h
index 80b2d22d73..f08c690d12 100644
--- a/tests/t537-operator.h
+++ b/tests/t537-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *J = in[1];
diff --git a/tests/t538-operator.c b/tests/t538-operator.c
index 45e86ecdff..0e5267019c 100644
--- a/tests/t538-operator.c
+++ b/tests/t538-operator.c
@@ -104,9 +104,9 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
 
   // Composite operator
-  CeedCompositeOperatorCreate(ceed, &op_apply);
-  CeedCompositeOperatorAddSub(op_apply, op_mass);
-  CeedCompositeOperatorAddSub(op_apply, op_diff);
+  CeedOperatorCreateComposite(ceed, &op_apply);
+  CeedOperatorCompositeAddSub(op_apply, op_mass);
+  CeedOperatorCompositeAddSub(op_apply, op_diff);
 
   // Assemble diagonal
   CeedVectorCreate(ceed, num_dofs, &assembled);
diff --git a/tests/t539-operator.h b/tests/t539-operator.h
index 3a4fda2475..65eaa85554 100644
--- a/tests/t539-operator.h
+++ b/tests/t539-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(apply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is gradient u_0, shape [2, num_comp=2, Q]
diff --git a/tests/t540-operator.h b/tests/t540-operator.h
index 79f5006719..0259af529c 100644
--- a/tests/t540-operator.h
+++ b/tests/t540-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *J = in[0], *weight = in[1];
diff --git a/tests/t541-operator.h b/tests/t541-operator.h
index 7eaa675c97..a8a3424f78 100644
--- a/tests/t541-operator.h
+++ b/tests/t541-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // in[0] is Jacobians with shape [2, nc=2, Q]
diff --git a/tests/t554-operator.c b/tests/t554-operator.c
index 0ca19605a9..d63c548696 100644
--- a/tests/t554-operator.c
+++ b/tests/t554-operator.c
@@ -33,10 +33,10 @@ int main(int argc, char **argv) {
   CeedVectorCreate(ceed, num_comp * num_dofs_u_fine, &v_fine);
 
   // Composite operators
-  CeedCompositeOperatorCreate(ceed, &op_mass_coarse);
-  CeedCompositeOperatorCreate(ceed, &op_mass_fine);
-  CeedCompositeOperatorCreate(ceed, &op_prolong);
-  CeedCompositeOperatorCreate(ceed, &op_restrict);
+  CeedOperatorCreateComposite(ceed, &op_mass_coarse);
+  CeedOperatorCreateComposite(ceed, &op_mass_fine);
+  CeedOperatorCreateComposite(ceed, &op_prolong);
+  CeedOperatorCreateComposite(ceed, &op_restrict);
 
   // Setup fine suboperators
   for (CeedInt i = 0; i < num_sub_ops; i++) {
@@ -99,7 +99,7 @@ int main(int argc, char **argv) {
     CeedOperatorApply(sub_op_setup, x, q_data, CEED_REQUEST_IMMEDIATE);
 
     // -- Composite operators
-    CeedCompositeOperatorAddSub(op_mass_fine, sub_op_mass_fine);
+    CeedOperatorCompositeAddSub(op_mass_fine, sub_op_mass_fine);
 
     // -- Cleanup
     CeedVectorDestroy(&q_data);
@@ -116,7 +116,7 @@ int main(int argc, char **argv) {
 
   // Scale for suboperator multiplicity
   CeedVectorCreate(ceed, num_comp * num_dofs_u_fine, &p_mult_fine);
-  CeedCompositeOperatorGetMultiplicity(op_mass_fine, 0, NULL, p_mult_fine);
+  CeedOperatorCompositeGetMultiplicity(op_mass_fine, 0, NULL, p_mult_fine);
 
   // Setup coarse and prolong/restriction suboperators
   for (CeedInt i = 0; i < num_sub_ops; i++) {
@@ -125,7 +125,7 @@ int main(int argc, char **argv) {
     CeedOperator       *sub_ops_mass_fine, sub_op_mass_coarse, sub_op_prolong, sub_op_restrict;
 
     // -- Fine grid operator
-    CeedCompositeOperatorGetSubList(op_mass_fine, &sub_ops_mass_fine);
+    CeedOperatorCompositeGetSubList(op_mass_fine, &sub_ops_mass_fine);
 
     // -- Restrictions
     CeedInt offset = num_elem_sub * i * (p_coarse - 1);
@@ -145,9 +145,9 @@ int main(int argc, char **argv) {
                                      &sub_op_prolong, &sub_op_restrict);
 
     // -- Composite operators
-    CeedCompositeOperatorAddSub(op_mass_coarse, sub_op_mass_coarse);
-    CeedCompositeOperatorAddSub(op_prolong, sub_op_prolong);
-    CeedCompositeOperatorAddSub(op_restrict, sub_op_restrict);
+    CeedOperatorCompositeAddSub(op_mass_coarse, sub_op_mass_coarse);
+    CeedOperatorCompositeAddSub(op_prolong, sub_op_prolong);
+    CeedOperatorCompositeAddSub(op_restrict, sub_op_restrict);
 
     // -- Cleanup
     CeedElemRestrictionDestroy(&elem_restriction_u_coarse);
diff --git a/tests/t565-operator.c b/tests/t565-operator.c
index b5a542451f..8ed3e0ea5f 100644
--- a/tests/t565-operator.c
+++ b/tests/t565-operator.c
@@ -107,9 +107,9 @@ int main(int argc, char **argv) {
   CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
 
   // Composite operator
-  CeedCompositeOperatorCreate(ceed, &op_apply);
-  CeedCompositeOperatorAddSub(op_apply, op_mass);
-  CeedCompositeOperatorAddSub(op_apply, op_diff);
+  CeedOperatorCreateComposite(ceed, &op_apply);
+  CeedOperatorCompositeAddSub(op_apply, op_mass);
+  CeedOperatorCompositeAddSub(op_apply, op_diff);
 
   // Fully assemble operator
   CeedSize   num_entries;
diff --git a/tests/t566-operator.h b/tests/t566-operator.h
index dfd0da43a2..c227b7d834 100644
--- a/tests/t566-operator.h
+++ b/tests/t566-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *weight = in[0], *J = in[1];
diff --git a/tests/t567-operator.h b/tests/t567-operator.h
index 6b645272dc..997b6db1bb 100644
--- a/tests/t567-operator.h
+++ b/tests/t567-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *w = in[0], (*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[1];
diff --git a/tests/t568-operator.h b/tests/t568-operator.h
index d52bc2d800..6c38bb04c7 100644
--- a/tests/t568-operator.h
+++ b/tests/t568-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store
diff --git a/tests/t580-operator.h b/tests/t580-operator.h
index 940a3605fc..cb7e472fba 100644
--- a/tests/t580-operator.h
+++ b/tests/t580-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 // Compute det(A)
 CEED_QFUNCTION_HELPER CeedScalar MatDet2x2(const CeedScalar A[2][2]) { return A[0][0] * A[1][1] - A[1][0] * A[0][1]; }
diff --git a/tests/t590-operator.h b/tests/t590-operator.h
index a2018718f8..c50595bc26 100644
--- a/tests/t590-operator.h
+++ b/tests/t590-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar *u = in[0];
diff --git a/tests/t591-operator.h b/tests/t591-operator.h
index 1c64f1181f..0a834e5056 100644
--- a/tests/t591-operator.h
+++ b/tests/t591-operator.h
@@ -1,11 +1,11 @@
-// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
 //
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
 
-#include <ceed.h>
+#include <ceed/types.h>
 
 CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
   const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0];
diff --git a/tests/t592-operator.c b/tests/t592-operator.c
index e0ccdace2e..1650e0fa89 100644
--- a/tests/t592-operator.c
+++ b/tests/t592-operator.c
@@ -1,6 +1,6 @@
 /// @file
 /// Test assembly of mass matrix operator QFunction at points
-/// \test Test assembly of mass matrix operator QFunction
+/// \test Test assembly of mass matrix operator QFunction at points
 #include <ceed.h>
 #include <math.h>
 #include <stdio.h>
@@ -173,12 +173,13 @@ int main(int argc, char **argv) {
 
     CeedVectorGetArrayRead(qf_assembled, CEED_MEM_HOST, &assembled_array);
     CeedVectorGetArrayRead(q_data, CEED_MEM_HOST, &q_data_array);
-    for (CeedInt i = 0; i < num_points; i++)
+    for (CeedInt i = 0; i < num_points; i++) {
       if (fabs(q_data_array[i] - assembled_array[i]) > 1e-9) {
         // LCOV_EXCL_START
         printf("Error: qf_assembled[%" CeedInt_FMT "] = %f != %f\n", i, assembled_array[i], q_data_array[i]);
         // LCOV_EXCL_STOP
       }
+    }
     CeedVectorRestoreArrayRead(qf_assembled, &assembled_array);
     CeedVectorRestoreArrayRead(q_data, &q_data_array);
   }
diff --git a/tests/t593-operator.c b/tests/t593-operator.c
index 5b145d0884..2e0710c7fc 100644
--- a/tests/t593-operator.c
+++ b/tests/t593-operator.c
@@ -1,5 +1,5 @@
 /// @file
-/// Bug reproducer for memcheck backends at points
+/// Test 1D mass matrix operator at points with heterogeneous points per element
 /// \test Test 1D mass matrix operator at points with heterogeneous points per element
 #include <ceed.h>
 #include <math.h>
@@ -85,13 +85,13 @@ int main(int argc, char **argv) {
 
   // Setup geometric scaling
   CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup);
-  CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD);
   CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT);
+  CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD);
   CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE);
 
   CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup);
-  CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+  CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
   CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
   CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points);
 
diff --git a/tests/t594-operator.c b/tests/t594-operator.c
new file mode 100644
index 0000000000..49405e37a4
--- /dev/null
+++ b/tests/t594-operator.c
@@ -0,0 +1,179 @@
+/// @file
+/// Test diagonal assembly of mass matrix operator at points
+/// \test Test diagonal assembly of mass matrix operator at points
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "t500-operator.h"
+
+int main(int argc, char **argv) {
+  Ceed       ceed;
+  CeedInt    num_elem = 3, dim = 1, p = 3, q = 5;
+  CeedInt    num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1, num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+  CeedInt    ind_x[num_elem * 2], ind_u[num_elem * p], ind_x_points[num_elem + 1 + num_points];
+  CeedScalar x_array_mesh[num_nodes_x], x_array_points[num_points], assembled_true[num_nodes_u];
+  CeedVector x_points = NULL, x_elem = NULL, q_data = NULL, u = NULL, v = NULL, assembled = NULL;
+  CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u;
+  CeedBasis           basis_x, basis_u;
+  CeedQFunction       qf_setup, qf_mass;
+  CeedOperator        op_setup, op_mass;
+  bool                is_at_points;
+
+  CeedInit(argv[1], &ceed);
+
+  // Mesh coordinates
+  for (CeedInt i = 0; i < num_nodes_x; i++) x_array_mesh[i] = (CeedScalar)i / (num_nodes_x - 1);
+  for (CeedInt i = 0; i < num_elem; i++) {
+    ind_x[2 * i + 0] = i;
+    ind_x[2 * i + 1] = i + 1;
+  }
+  CeedElemRestrictionCreate(ceed, num_elem, 2, 1, 1, num_nodes_x, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x);
+  CeedVectorCreate(ceed, num_nodes_x, &x_elem);
+  CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_USE_POINTER, x_array_mesh);
+
+  // U mesh
+  for (CeedInt i = 0; i < num_elem; i++) {
+    for (CeedInt j = 0; j < p; j++) {
+      ind_u[p * i + j] = i * (p - 1) + j;
+    }
+  }
+  CeedElemRestrictionCreate(ceed, num_elem, p, 1, 1, num_nodes_u, CEED_MEM_HOST, CEED_USE_POINTER, ind_u, &elem_restriction_u);
+
+  // Point reference coordinates
+  {
+    CeedScalar weight_tmp[num_points_per_elem + 1];
+    CeedInt    current_index = 0;
+
+    // Use num_points_per_elem + 1 to test non-uniform quadrature
+    CeedGaussQuadrature(num_points_per_elem + 1, x_array_points, weight_tmp);
+    ind_x_points[0] = num_elem + 1;
+    for (CeedInt p = 0; p < num_points_per_elem + 1; p++, current_index++) {
+      ind_x_points[num_elem + 1 + current_index] = current_index;
+    }
+    // Use num_points_per_elem for middle elements
+    for (CeedInt e = 1; e < num_elem - 1; e++) {
+      CeedGaussQuadrature(num_points_per_elem, &x_array_points[current_index], weight_tmp);
+      ind_x_points[e] = num_elem + 1 + current_index;
+      for (CeedInt p = 0; p < num_points_per_elem; p++, current_index++) {
+        ind_x_points[num_elem + 1 + current_index] = current_index;
+      }
+    }
+    // Use num_points_per_elem - 1 to test non-uniform quadrature
+    CeedGaussQuadrature(num_points_per_elem - 1, &x_array_points[current_index], weight_tmp);
+    ind_x_points[num_elem - 1] = num_elem + 1 + current_index;
+    for (CeedInt p = 0; p < num_points_per_elem - 1; p++, current_index++) {
+      ind_x_points[num_elem + 1 + current_index] = current_index;
+    }
+    ind_x_points[num_elem] = num_elem + 1 + current_index;
+
+    CeedVectorCreate(ceed, num_elem * num_points_per_elem, &x_points);
+    CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_USE_POINTER, x_array_points);
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points,
+                                      &elem_restriction_x_points);
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x_points,
+                                      &elem_restriction_q_data);
+
+    // Q data
+    CeedVectorCreate(ceed, num_points, &q_data);
+  }
+
+  // Basis creation
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x);
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u);
+
+  // Setup geometric scaling
+  CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup);
+  CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT);
+  CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD);
+  CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE);
+
+  CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup);
+  CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+  CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points);
+
+  CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE);
+
+  // Mass operator
+  CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass);
+  CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE);
+  CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP);
+
+  CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass);
+  CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data);
+  CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points);
+
+  CeedOperatorIsAtPoints(op_mass, &is_at_points);
+  if (!is_at_points) printf("Error: Operator should be at points\n");
+
+  CeedVectorCreate(ceed, num_nodes_u, &u);
+  CeedVectorSetValue(u, 0.0);
+  CeedVectorCreate(ceed, num_nodes_u, &v);
+
+  // Assemble diagonal
+  CeedVectorCreate(ceed, num_nodes_u, &assembled);
+  CeedOperatorLinearAssembleDiagonal(op_mass, assembled, CEED_REQUEST_IMMEDIATE);
+
+  // Manually assemble diagonal
+  CeedVectorSetValue(u, 0.0);
+  for (CeedInt i = 0; i < num_nodes_u; i++) {
+    CeedScalar       *u_array;
+    const CeedScalar *v_array;
+
+    // Set input
+    CeedVectorGetArray(u, CEED_MEM_HOST, &u_array);
+    u_array[i] = 1.0;
+    if (i) u_array[i - 1] = 0.0;
+    CeedVectorRestoreArray(u, &u_array);
+
+    // Compute diag entry for DoF i
+    CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE);
+
+    // Retrieve entry
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    assembled_true[i] = v_array[i];
+    CeedVectorRestoreArrayRead(v, &v_array);
+  }
+
+  // Check output
+  {
+    const CeedScalar *assembled_array;
+
+    CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array);
+    for (CeedInt i = 0; i < num_nodes_u; i++) {
+      if (fabs(assembled_array[i] - assembled_true[i]) > 100. * CEED_EPSILON) {
+        // LCOV_EXCL_START
+        printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, assembled_array[i], assembled_true[i]);
+        // LCOV_EXCL_STOP
+      }
+    }
+    CeedVectorRestoreArrayRead(assembled, &assembled_array);
+  }
+
+  // Cleanup
+  CeedVectorDestroy(&q_data);
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&q_data);
+  CeedVectorDestroy(&x_elem);
+  CeedVectorDestroy(&assembled);
+  CeedElemRestrictionDestroy(&elem_restriction_q_data);
+  CeedElemRestrictionDestroy(&elem_restriction_x);
+  CeedElemRestrictionDestroy(&elem_restriction_x_points);
+  CeedElemRestrictionDestroy(&elem_restriction_u);
+  CeedBasisDestroy(&basis_x);
+  CeedBasisDestroy(&basis_u);
+  CeedQFunctionDestroy(&qf_setup);
+  CeedQFunctionDestroy(&qf_mass);
+  CeedOperatorDestroy(&op_setup);
+  CeedOperatorDestroy(&op_mass);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t595-operator.c b/tests/t595-operator.c
new file mode 100644
index 0000000000..e874ccb2ba
--- /dev/null
+++ b/tests/t595-operator.c
@@ -0,0 +1,125 @@
+/// @file
+/// Test FLOP estimation for mass matrix operator at points
+/// \test Test FLOP estimation for mass matrix operator at points
+#include "t595-operator.h"
+
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char **argv) {
+  Ceed    ceed;
+  CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p = 3, q = 5;
+  CeedInt num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+  CeedSize            flop_estimate = 0;
+  CeedVector          x_points, q_data;
+  CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_u;
+  CeedBasis           basis_x, basis_u;
+  CeedQFunction       qf_mass;
+  CeedOperator        op_mass;
+  bool                is_at_points;
+
+  CeedInit(argv[1], &ceed);
+
+  // Point reference coordinates
+  CeedVectorCreate(ceed, dim * num_points, &x_points);
+  {
+    CeedScalar x_array[dim * num_points];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      for (CeedInt d = 0; d < dim; d++) {
+        x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25;
+      }
+    }
+    CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+  }
+  {
+    CeedInt ind_x[num_elem + 1 + num_points];
+
+    for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem;
+    for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i;
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x,
+                                      &elem_restriction_x_points);
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data);
+  }
+
+  // Q data
+  CeedVectorCreate(ceed, num_points, &q_data);
+
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x);
+
+  // Cell solution
+  {
+    CeedInt ind_u[num_elem * p * p];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0};
+
+      for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1;
+      {
+        CeedInt r_e = e;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          elem_xy[d] = r_e % num_elem_1d;
+          r_e /= num_elem_1d;
+        }
+      }
+      CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_u + e * num_nodes_in_elem;
+
+      for (CeedInt n = 0; n < num_nodes_in_elem; n++) {
+        CeedInt g_node = 0, g_node_stride = 1, r_node = n;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride;
+          g_node_stride *= n_d[d];
+          r_node /= p;
+        }
+        elem_nodes[n] = g_node;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, &elem_restriction_u);
+  }
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u);
+
+  // Mass operator
+  CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass);
+  CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE);
+  CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP);
+
+  CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass);
+  CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data);
+  CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points);
+
+  CeedOperatorIsAtPoints(op_mass, &is_at_points);
+  if (!is_at_points) printf("Error: Operator should be at points\n");
+
+  // Estimate FLOPs
+  CeedQFunctionSetUserFlopsEstimate(qf_mass, 1);
+  CeedOperatorGetFlopsEstimate(op_mass, &flop_estimate);
+
+  // Check output
+  if (flop_estimate != 16317) {
+    // LCOV_EXCL_START
+    printf("Incorrect FLOP estimate computed, %ld != 16317\n", flop_estimate);
+    // LCOV_EXCL_STOP
+  }
+
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&q_data);
+  CeedElemRestrictionDestroy(&elem_restriction_x_points);
+  CeedElemRestrictionDestroy(&elem_restriction_q_data);
+  CeedElemRestrictionDestroy(&elem_restriction_u);
+  CeedBasisDestroy(&basis_x);
+  CeedBasisDestroy(&basis_u);
+  CeedQFunctionDestroy(&qf_mass);
+  CeedOperatorDestroy(&op_mass);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t595-operator.h b/tests/t595-operator.h
new file mode 100644
index 0000000000..a5ddb3b9d7
--- /dev/null
+++ b/tests/t595-operator.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar *u = in[0], *rho = in[1];
+  CeedScalar       *v = out[0];
+
+  // Quadrature point loop
+  CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { v[i] = rho[i] * u[i]; }
+  return 0;
+}
diff --git a/tests/t596-operator.c b/tests/t596-operator.c
new file mode 100644
index 0000000000..81ca865ebd
--- /dev/null
+++ b/tests/t596-operator.c
@@ -0,0 +1,202 @@
+/// @file
+/// Test full assembly of mass matrix operator
+/// \test Test full assembly of mass matrix operator AtPoints
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "t596-operator.h"
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) {
+    CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data;
+    CeedBasis           basis_x, basis_u;
+    CeedQFunction       qf_setup, qf_mass;
+    CeedOperator        op_setup, op_mass;
+    CeedVector          q_data, x, x_points, u, v;
+    CeedInt             p = 3, q = 4, dim = 2;
+    CeedInt             n_x = 3, n_y = 2;
+    CeedInt             num_elem = n_x * n_y;
+    CeedInt             num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+    CeedInt             ind_x[num_elem * p * p];
+    CeedScalar          assembled_values[num_comp * num_comp * num_dofs * num_dofs];
+    CeedScalar          assembled_true[num_comp * num_comp * num_dofs * num_dofs];
+
+    // Points
+    CeedVectorCreate(ceed, dim * num_points, &x_points);
+    {
+      CeedScalar x_array[dim * num_points];
+
+      for (CeedInt e = 0; e < num_elem; e++) {
+        for (CeedInt d = 0; d < dim; d++) {
+          x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25;
+        }
+      }
+      CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    {
+      CeedInt ind_x[num_elem + 1 + num_points];
+
+      for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem;
+      for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i;
+      CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x,
+                                        &elem_restriction_x_points);
+      CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data);
+    }
+
+    // Vectors
+    CeedVectorCreate(ceed, dim * num_dofs, &x);
+    {
+      CeedScalar x_array[dim * num_dofs];
+
+      for (CeedInt i = 0; i < n_x * 2 + 1; i++) {
+        for (CeedInt j = 0; j < n_y * 2 + 1; j++) {
+          x_array[i + j * (n_x * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_x);
+          x_array[i + j * (n_x * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_y);
+        }
+      }
+      CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    CeedVectorCreate(ceed, num_comp * num_dofs, &u);
+    CeedVectorCreate(ceed, num_comp * num_dofs, &v);
+    CeedVectorCreate(ceed, num_points, &q_data);
+
+    // Restrictions
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt col, row, offset;
+      col    = i % n_x;
+      row    = i / n_x;
+      offset = col * (p - 1) + row * (n_x * 2 + 1) * (p - 1);
+      for (CeedInt j = 0; j < p; j++) {
+        for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (n_x * 2 + 1) + j;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x);
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, num_comp, num_dofs, num_comp * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x,
+                              &elem_restriction_u);
+
+    // Bases
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p, q, CEED_GAUSS, &basis_x);
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis_u);
+
+    // QFunctions
+    CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup);
+    CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE);
+
+    CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass);
+    CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE);
+    CeedQFunctionAddInput(qf_mass, "u", num_comp, CEED_EVAL_INTERP);
+    CeedQFunctionAddOutput(qf_mass, "v", num_comp, CEED_EVAL_INTERP);
+    {
+      CeedQFunctionContext qf_context;
+
+      CeedQFunctionContextCreate(ceed, &qf_context);
+      CeedQFunctionContextSetData(qf_context, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(CeedInt), &num_comp);
+      CeedQFunctionSetContext(qf_mass, qf_context);
+      CeedQFunctionContextDestroy(&qf_context);
+    }
+
+    // Operators
+    CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup);
+    CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+    CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+    CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points);
+
+    CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass);
+    CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data);
+    CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+    CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points);
+
+    // Apply Setup Operator
+    CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE);
+
+    // Fully assemble operator
+    CeedSize   num_entries;
+    CeedInt   *rows;
+    CeedInt   *cols;
+    CeedVector assembled;
+
+    for (CeedInt k = 0; k < num_comp * num_comp * num_dofs * num_dofs; ++k) {
+      assembled_values[k] = 0.0;
+      assembled_true[k]   = 0.0;
+    }
+    CeedOperatorLinearAssembleSymbolic(op_mass, &num_entries, &rows, &cols);
+    CeedVectorCreate(ceed, num_entries, &assembled);
+    CeedOperatorLinearAssemble(op_mass, assembled);
+    {
+      const CeedScalar *assembled_array;
+
+      CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array);
+      for (CeedInt k = 0; k < num_entries; k++) {
+        assembled_values[rows[k] * num_comp * num_dofs + cols[k]] += assembled_array[k];
+      }
+      CeedVectorRestoreArrayRead(assembled, &assembled_array);
+    }
+
+    // Manually assemble operator
+    CeedVectorSetValue(u, 0.0);
+    for (CeedInt j = 0; j < num_comp * num_dofs; j++) {
+      CeedScalar       *u_array;
+      const CeedScalar *v_array;
+
+      // Set input
+      CeedVectorGetArray(u, CEED_MEM_HOST, &u_array);
+      u_array[j] = 1.0;
+      if (j) u_array[j - 1] = 0.0;
+      CeedVectorRestoreArray(u, &u_array);
+
+      // Compute entries for column j
+      CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE);
+
+      CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+      for (CeedInt i = 0; i < num_comp * num_dofs; i++) assembled_true[i * num_comp * num_dofs + j] = v_array[i];
+      CeedVectorRestoreArrayRead(v, &v_array);
+    }
+
+    // Check output
+    for (CeedInt i = 0; i < num_comp * num_dofs; i++) {
+      for (CeedInt j = 0; j < num_comp * num_dofs; j++) {
+        if (fabs(assembled_values[i * num_dofs * num_comp + j] - assembled_true[i * num_dofs * num_comp + j]) > 100. * CEED_EPSILON) {
+          // LCOV_EXCL_START
+          printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_dofs * num_comp + j],
+                 assembled_true[i * num_dofs * num_comp + j]);
+          // LCOV_EXCL_STOP
+        }
+      }
+    }
+
+    // Cleanup
+    free(rows);
+    free(cols);
+    CeedVectorDestroy(&x);
+    CeedVectorDestroy(&x_points);
+    CeedVectorDestroy(&q_data);
+    CeedVectorDestroy(&u);
+    CeedVectorDestroy(&v);
+    CeedVectorDestroy(&assembled);
+    CeedElemRestrictionDestroy(&elem_restriction_u);
+    CeedElemRestrictionDestroy(&elem_restriction_x);
+    CeedElemRestrictionDestroy(&elem_restriction_x_points);
+    CeedElemRestrictionDestroy(&elem_restriction_q_data);
+    CeedBasisDestroy(&basis_u);
+    CeedBasisDestroy(&basis_x);
+    CeedQFunctionDestroy(&qf_setup);
+    CeedQFunctionDestroy(&qf_mass);
+    CeedOperatorDestroy(&op_setup);
+    CeedOperatorDestroy(&op_mass);
+  }
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t596-operator.h b/tests/t596-operator.h
new file mode 100644
index 0000000000..85dc60e259
--- /dev/null
+++ b/tests/t596-operator.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar *weight = in[0], *J = in[1];
+  CeedScalar       *rho = out[0];
+
+  for (CeedInt i = 0; i < Q; i++) {
+    rho[i] = weight[i] * (J[i + Q * 0] * J[i + Q * 3] - J[i + Q * 1] * J[i + Q * 2]);
+  }
+  return 0;
+}
+
+CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  CeedInt           num_comp = *(CeedInt *)ctx;
+  const CeedScalar *rho = in[0], *u = in[1];
+  CeedScalar       *v = out[0];
+
+  for (CeedInt i = 0; i < Q; i++) {
+    for (CeedInt c = 0; c < num_comp; c++) v[i + c * Q] = rho[i] * c * u[i + c * Q];
+  }
+  return 0;
+}
diff --git a/tests/t597-operator.c b/tests/t597-operator.c
new file mode 100644
index 0000000000..25d6b3cf3f
--- /dev/null
+++ b/tests/t597-operator.c
@@ -0,0 +1,203 @@
+/// @file
+/// Test full assembly of Poisson operator AtPoints
+/// \test Test full assembly of Poisson operator AtPoints
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "t597-operator.h"
+
+int main(int argc, char **argv) {
+  Ceed ceed;
+
+  CeedInit(argv[1], &ceed);
+
+  for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) {
+    CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data;
+    CeedBasis           basis_x, basis_u;
+    CeedQFunction       qf_setup, qf_diff;
+    CeedOperator        op_setup, op_diff;
+    CeedVector          q_data, x, x_points, u, v;
+    CeedInt             p = 3, q = 4, dim = 2;
+    CeedInt             n_x = 3, n_y = 2;
+    CeedInt             num_elem = n_x * n_y;
+    CeedInt             num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+    CeedInt             ind_x[num_elem * p * p];
+    CeedScalar          assembled_values[num_comp * num_comp * num_dofs * num_dofs];
+    CeedScalar          assembled_true[num_comp * num_comp * num_dofs * num_dofs];
+
+    // Points
+    CeedVectorCreate(ceed, dim * num_points, &x_points);
+    {
+      CeedScalar x_array[dim * num_points];
+
+      for (CeedInt e = 0; e < num_elem; e++) {
+        for (CeedInt d = 0; d < dim; d++) {
+          x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25;
+          x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25;
+        }
+      }
+      CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    {
+      CeedInt ind_x[num_elem + 1 + num_points];
+
+      for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem;
+      for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i;
+      CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x,
+                                        &elem_restriction_x_points);
+      CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim * (dim + 1) / 2, num_points * dim * (dim + 1) / 2, CEED_MEM_HOST,
+                                        CEED_COPY_VALUES, ind_x, &elem_restriction_q_data);
+    }
+
+    // Vectors
+    CeedVectorCreate(ceed, dim * num_dofs, &x);
+    {
+      CeedScalar x_array[dim * num_dofs];
+
+      for (CeedInt i = 0; i < n_x * 2 + 1; i++) {
+        for (CeedInt j = 0; j < n_y * 2 + 1; j++) {
+          x_array[i + j * (n_x * 2 + 1) + 0 * num_dofs] = (CeedScalar)i / (2 * n_x);
+          x_array[i + j * (n_x * 2 + 1) + 1 * num_dofs] = (CeedScalar)j / (2 * n_y);
+        }
+      }
+      CeedVectorSetArray(x, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+    CeedVectorCreate(ceed, num_comp * num_dofs, &u);
+    CeedVectorCreate(ceed, num_comp * num_dofs, &v);
+    CeedVectorCreate(ceed, num_points * dim * (dim + 1) / 2, &q_data);
+
+    // Restrictions
+    for (CeedInt i = 0; i < num_elem; i++) {
+      CeedInt col, row, offset;
+      col    = i % n_x;
+      row    = i / n_x;
+      offset = col * (p - 1) + row * (n_x * 2 + 1) * (p - 1);
+      for (CeedInt j = 0; j < p; j++) {
+        for (CeedInt k = 0; k < p; k++) ind_x[p * (p * i + k) + j] = offset + k * (n_x * 2 + 1) + j;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, num_dofs, dim * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x, &elem_restriction_x);
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, num_comp, num_dofs, num_comp * num_dofs, CEED_MEM_HOST, CEED_USE_POINTER, ind_x,
+                              &elem_restriction_u);
+
+    // Bases
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, p, q, CEED_GAUSS, &basis_x);
+    CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, p, q, CEED_GAUSS, &basis_u);
+
+    // QFunction - setup
+    CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup);
+    CeedQFunctionAddInput(qf_setup, "dx", dim * dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT);
+    CeedQFunctionAddOutput(qf_setup, "q data", dim * (dim + 1) / 2, CEED_EVAL_NONE);
+
+    // Operator - setup
+    CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup);
+    CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+    CeedOperatorSetField(op_setup, "q data", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+    CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points);
+
+    // Apply Setup Operator
+    CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE);
+
+    // QFunction - apply
+    CeedQFunctionCreateInterior(ceed, 1, diff, diff_loc, &qf_diff);
+    CeedQFunctionAddInput(qf_diff, "du", num_comp * dim, CEED_EVAL_GRAD);
+    CeedQFunctionAddInput(qf_diff, "q data", dim * (dim + 1) / 2, CEED_EVAL_NONE);
+    CeedQFunctionAddOutput(qf_diff, "dv", num_comp * dim, CEED_EVAL_GRAD);
+    {
+      CeedQFunctionContext qf_context;
+
+      CeedQFunctionContextCreate(ceed, &qf_context);
+      CeedQFunctionContextSetData(qf_context, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(CeedInt), &num_comp);
+      CeedQFunctionSetContext(qf_diff, qf_context);
+      CeedQFunctionContextDestroy(&qf_context);
+    }
+
+    // Operator - apply
+    CeedOperatorCreateAtPoints(ceed, qf_diff, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff);
+    CeedOperatorSetField(op_diff, "du", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+    CeedOperatorSetField(op_diff, "q data", elem_restriction_q_data, CEED_BASIS_NONE, q_data);
+    CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+    CeedOperatorAtPointsSetPoints(op_diff, elem_restriction_x_points, x_points);
+
+    // Fully assemble operator
+    CeedSize   num_entries;
+    CeedInt   *rows;
+    CeedInt   *cols;
+    CeedVector assembled;
+
+    for (CeedInt k = 0; k < num_comp * num_comp * num_dofs * num_dofs; ++k) {
+      assembled_values[k] = 0.0;
+      assembled_true[k]   = 0.0;
+    }
+    CeedOperatorLinearAssembleSymbolic(op_diff, &num_entries, &rows, &cols);
+    CeedVectorCreate(ceed, num_entries, &assembled);
+    CeedOperatorLinearAssemble(op_diff, assembled);
+    {
+      const CeedScalar *assembled_array;
+
+      CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array);
+      for (CeedInt k = 0; k < num_entries; k++) assembled_values[rows[k] * num_comp * num_dofs + cols[k]] += assembled_array[k];
+      CeedVectorRestoreArrayRead(assembled, &assembled_array);
+    }
+
+    // Manually assemble operator
+    CeedVectorSetValue(u, 0.0);
+    for (CeedInt j = 0; j < num_comp * num_dofs; j++) {
+      CeedScalar       *u_array;
+      const CeedScalar *v_array;
+
+      // Set input
+      CeedVectorGetArray(u, CEED_MEM_HOST, &u_array);
+      u_array[j] = 1.0;
+      if (j) u_array[j - 1] = 0.0;
+      CeedVectorRestoreArray(u, &u_array);
+
+      // Compute entries for column j
+      CeedOperatorApply(op_diff, u, v, CEED_REQUEST_IMMEDIATE);
+
+      CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+      for (CeedInt i = 0; i < num_comp * num_dofs; i++) assembled_true[i * num_comp * num_dofs + j] = v_array[i];
+      CeedVectorRestoreArrayRead(v, &v_array);
+    }
+
+    // Check output
+    for (CeedInt i = 0; i < num_comp * num_dofs; i++) {
+      for (CeedInt j = 0; j < num_comp * num_dofs; j++) {
+        if (fabs(assembled_values[i * num_comp * num_dofs + j] - assembled_true[i * num_comp * num_dofs + j]) > 100. * CEED_EPSILON) {
+          // LCOV_EXCL_START
+          printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_comp * num_dofs + j],
+                 assembled_true[i * num_comp * num_dofs + j]);
+          // LCOV_EXCL_STOP
+        }
+      }
+    }
+
+    // Cleanup
+    free(rows);
+    free(cols);
+    CeedVectorDestroy(&x);
+    CeedVectorDestroy(&x_points);
+    CeedVectorDestroy(&q_data);
+    CeedVectorDestroy(&u);
+    CeedVectorDestroy(&v);
+    CeedVectorDestroy(&assembled);
+    CeedElemRestrictionDestroy(&elem_restriction_u);
+    CeedElemRestrictionDestroy(&elem_restriction_x);
+    CeedElemRestrictionDestroy(&elem_restriction_x_points);
+    CeedElemRestrictionDestroy(&elem_restriction_q_data);
+    CeedBasisDestroy(&basis_u);
+    CeedBasisDestroy(&basis_x);
+    CeedQFunctionDestroy(&qf_setup);
+    CeedQFunctionDestroy(&qf_diff);
+    CeedOperatorDestroy(&op_setup);
+    CeedOperatorDestroy(&op_diff);
+  }
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t597-operator.h b/tests/t597-operator.h
new file mode 100644
index 0000000000..57b8e0dec6
--- /dev/null
+++ b/tests/t597-operator.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store
+  // the symmetric part of the result.
+
+  // in[0] is Jacobians with shape [2, nc=2, Q]
+  // in[1] is quadrature weights, size (Q)
+  const CeedScalar *J = in[0], *qw = in[1];
+
+  // out[0] is qdata, size (Q)
+  CeedScalar *qd = out[0];
+
+  // Quadrature point loop
+  for (CeedInt i = 0; i < Q; i++) {
+    // J: 0 2   qd: 0 2   adj(J):  J22 -J12
+    //    1 3       2 1           -J21  J11
+    const CeedScalar J11 = J[i + Q * 0];
+    const CeedScalar J21 = J[i + Q * 1];
+    const CeedScalar J12 = J[i + Q * 2];
+    const CeedScalar J22 = J[i + Q * 3];
+    const CeedScalar w   = qw[i] / (J11 * J22 - J21 * J12);
+    qd[i + Q * 0]        = w * (J12 * J12 + J22 * J22);
+    qd[i + Q * 2]        = w * (J11 * J11 + J21 * J21);
+    qd[i + Q * 1]        = -w * (J11 * J12 + J21 * J22);
+  }
+
+  return 0;
+}
+
+CEED_QFUNCTION(diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  CeedInt num_comp = *(CeedInt *)ctx;
+  // in[0] is gradient u, shape [2, nc=1, Q]
+  // in[1] is quadrature data, size (3*Q)
+  const CeedScalar *du = in[0], *qd = in[1];
+
+  // out[0] is output to multiply against gradient v, shape [2, nc=1, Q]
+  CeedScalar *dv = out[0];
+
+  // Quadrature point loop
+  for (CeedInt i = 0; i < Q; i++) {
+    for (CeedInt c = 0; c < num_comp; c++) {
+      const CeedScalar du0 = du[i + Q * (2 * c + 0)];
+      const CeedScalar du1 = du[i + Q * (2 * c + 1)];
+
+      dv[i + Q * (2 * c + 0)] = qd[i + Q * 0] * du0 + qd[i + Q * 2] * du1;
+      dv[i + Q * (2 * c + 1)] = qd[i + Q * 2] * du0 + qd[i + Q * 1] * du1;
+    }
+  }
+
+  return 0;
+}
diff --git a/tests/t598-operator.c b/tests/t598-operator.c
new file mode 100644
index 0000000000..55c7560fbb
--- /dev/null
+++ b/tests/t598-operator.c
@@ -0,0 +1,279 @@
+/// @file
+/// Test creation, action, and destruction for mass matrix operator AtPoints
+/// \test Test creation, action, and destruction for mass matrix operator AtPoints
+#include "t591-operator.h"
+
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char **argv) {
+  Ceed                ceed;
+  CeedInt             num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p_coarse = 2, p_fine = 3, q = 5;
+  CeedInt             num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+  CeedInt             num_nodes_coarse = (num_elem_1d * (p_coarse - 1) + 1) * (num_elem_1d * (p_coarse - 1) + 1);
+  CeedInt             num_nodes_fine   = (num_elem_1d * (p_fine - 1) + 1) * (num_elem_1d * (p_fine - 1) + 1);
+  CeedVector          x_points, x_elem, q_data, u_coarse, u_fine, v_coarse, v_fine, p_mult_fine;
+  CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_x, elem_restriction_u_coarse, elem_restriction_u_fine;
+  CeedBasis           basis_x, basis_u_coarse, basis_u_fine;
+  CeedQFunction       qf_setup, qf_mass;
+  CeedOperator        op_setup, op_mass_coarse, op_mass_fine, op_prolong, op_restrict;
+
+  CeedInit(argv[1], &ceed);
+
+  // Point reference coordinates
+  CeedVectorCreate(ceed, dim * num_points, &x_points);
+  {
+    CeedScalar x_array[dim * num_points];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      for (CeedInt d = 0; d < dim; d++) {
+        x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25;
+      }
+    }
+    CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+  }
+  {
+    CeedInt ind_x[num_elem + 1 + num_points];
+
+    for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem;
+    for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i;
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x,
+                                      &elem_restriction_x_points);
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data);
+  }
+
+  // Q data
+  CeedVectorCreate(ceed, num_points, &q_data);
+
+  // Cell coordinates
+  {
+    CeedInt p = 2, num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1);
+    CeedInt ind_x[num_elem * p * p];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0};
+
+      for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1;
+      {
+        CeedInt r_e = e;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          elem_xy[d] = r_e % num_elem_1d;
+          r_e /= num_elem_1d;
+        }
+      }
+      CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_x + e * num_nodes_in_elem;
+
+      for (CeedInt n = 0; n < num_nodes_in_elem; n++) {
+        CeedInt g_node = 0, g_node_stride = 1, r_node = n;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride;
+          g_node_stride *= n_d[d];
+          r_node /= p;
+        }
+        elem_nodes[n] = p * g_node;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, dim, 1, dim * num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_x);
+    CeedVectorCreate(ceed, dim * num_nodes, &x_elem);
+    {
+      CeedScalar x_array[dim * num_nodes];
+
+      for (CeedInt i = 0; i <= num_elem_1d; i++) {
+        for (CeedInt j = 0; j <= num_elem_1d; j++) {
+          x_array[(i * (num_elem_1d + 1) + j) * dim + 0] = j;
+          x_array[(i * (num_elem_1d + 1) + j) * dim + 1] = i;
+        }
+      }
+      CeedVectorSetArray(x_elem, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+    }
+  }
+
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x);
+
+  // Cell solution
+  {
+    CeedInt ind_u[num_elem * p_coarse * p_coarse];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0};
+
+      for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p_coarse - 1) + 1;
+      {
+        CeedInt r_e = e;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          elem_xy[d] = r_e % num_elem_1d;
+          r_e /= num_elem_1d;
+        }
+      }
+      CeedInt num_nodes_in_elem = p_coarse * p_coarse, *elem_nodes = ind_u + e * num_nodes_in_elem;
+
+      for (CeedInt n = 0; n < num_nodes_in_elem; n++) {
+        CeedInt g_node = 0, g_node_stride = 1, r_node = n;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          g_node += (elem_xy[d] * (p_coarse - 1) + r_node % p_coarse) * g_node_stride;
+          g_node_stride *= n_d[d];
+          r_node /= p_coarse;
+        }
+        elem_nodes[n] = g_node;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p_coarse * p_coarse, 1, 1, num_nodes_coarse, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u,
+                              &elem_restriction_u_coarse);
+  }
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_coarse, q, CEED_GAUSS, &basis_u_coarse);
+  {
+    CeedInt ind_u[num_elem * p_fine * p_fine];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0};
+
+      for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p_fine - 1) + 1;
+      {
+        CeedInt r_e = e;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          elem_xy[d] = r_e % num_elem_1d;
+          r_e /= num_elem_1d;
+        }
+      }
+      CeedInt num_nodes_in_elem = p_fine * p_fine, *elem_nodes = ind_u + e * num_nodes_in_elem;
+
+      for (CeedInt n = 0; n < num_nodes_in_elem; n++) {
+        CeedInt g_node = 0, g_node_stride = 1, r_node = n;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          g_node += (elem_xy[d] * (p_fine - 1) + r_node % p_fine) * g_node_stride;
+          g_node_stride *= n_d[d];
+          r_node /= p_fine;
+        }
+        elem_nodes[n] = g_node;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p_fine * p_fine, 1, 1, num_nodes_fine, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u,
+                              &elem_restriction_u_fine);
+  }
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p_fine, q, CEED_GAUSS, &basis_u_fine);
+
+  // Setup geometric scaling
+  CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_setup);
+  CeedQFunctionAddInput(qf_setup, "x", dim * dim, CEED_EVAL_GRAD);
+  CeedQFunctionAddInput(qf_setup, "weight", 1, CEED_EVAL_WEIGHT);
+  CeedQFunctionAddOutput(qf_setup, "rho", 1, CEED_EVAL_NONE);
+
+  CeedOperatorCreateAtPoints(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup);
+  CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE);
+  CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points);
+
+  CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE);
+
+  // Mass operator
+  CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass);
+  CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE);
+  CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP);
+
+  CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_fine);
+  CeedOperatorSetField(op_mass_fine, "u", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_mass_fine, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data);
+  CeedOperatorSetField(op_mass_fine, "v", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_mass_fine, elem_restriction_x_points, x_points);
+
+  CeedVectorCreate(ceed, num_nodes_fine, &u_fine);
+  CeedVectorCreate(ceed, num_nodes_fine, &v_fine);
+  CeedVectorCreate(ceed, num_nodes_fine, &p_mult_fine);
+  CeedVectorCreate(ceed, num_nodes_coarse, &u_coarse);
+  CeedVectorCreate(ceed, num_nodes_coarse, &v_coarse);
+
+  // Create multigrid level
+  CeedVectorSetValue(p_mult_fine, 1.0);
+  CeedOperatorMultigridLevelCreate(op_mass_fine, p_mult_fine, elem_restriction_u_coarse, basis_u_coarse, &op_mass_coarse, &op_prolong, &op_restrict);
+
+  // Coarse problem
+  CeedVectorSetValue(u_coarse, 1.0);
+  CeedOperatorApply(op_mass_coarse, u_coarse, v_coarse, CEED_REQUEST_IMMEDIATE);
+
+  // Check output
+  {
+    const CeedScalar *v_array;
+    CeedScalar        sum = 0.;
+
+    CeedVectorGetArrayRead(v_coarse, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < num_nodes_coarse; i++) {
+      sum += v_array[i];
+    }
+    CeedVectorRestoreArrayRead(v_coarse, &v_array);
+    if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum);
+  }
+
+  // Prolong coarse u
+  CeedOperatorApply(op_prolong, u_coarse, u_fine, CEED_REQUEST_IMMEDIATE);
+
+  // Fine problem
+  CeedOperatorApply(op_mass_fine, u_fine, v_fine, CEED_REQUEST_IMMEDIATE);
+
+  // Check output
+  {
+    const CeedScalar *v_array;
+    CeedScalar        sum = 0.;
+
+    CeedVectorGetArrayRead(v_fine, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < num_nodes_fine; i++) {
+      sum += v_array[i];
+    }
+    CeedVectorRestoreArrayRead(v_fine, &v_array);
+
+    if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum);
+  }
+  // Restrict state to coarse grid
+  CeedOperatorApply(op_restrict, v_fine, v_coarse, CEED_REQUEST_IMMEDIATE);
+
+  // Check output
+  {
+    const CeedScalar *v_array;
+    CeedScalar        sum = 0.;
+
+    CeedVectorGetArrayRead(v_coarse, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < num_nodes_coarse; i++) {
+      sum += v_array[i];
+    }
+    CeedVectorRestoreArrayRead(v_coarse, &v_array);
+    if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum);
+  }
+
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&q_data);
+  CeedVectorDestroy(&x_elem);
+  CeedVectorDestroy(&u_coarse);
+  CeedVectorDestroy(&u_fine);
+  CeedVectorDestroy(&v_fine);
+  CeedVectorDestroy(&v_coarse);
+  CeedVectorDestroy(&p_mult_fine);
+  CeedElemRestrictionDestroy(&elem_restriction_x_points);
+  CeedElemRestrictionDestroy(&elem_restriction_q_data);
+  CeedElemRestrictionDestroy(&elem_restriction_x);
+  CeedElemRestrictionDestroy(&elem_restriction_u_coarse);
+  CeedElemRestrictionDestroy(&elem_restriction_u_fine);
+  CeedBasisDestroy(&basis_x);
+  CeedBasisDestroy(&basis_u_coarse);
+  CeedBasisDestroy(&basis_u_fine);
+  CeedQFunctionDestroy(&qf_setup);
+  CeedQFunctionDestroy(&qf_mass);
+  CeedOperatorDestroy(&op_setup);
+  CeedOperatorDestroy(&op_mass_coarse);
+  CeedOperatorDestroy(&op_mass_fine);
+  CeedOperatorDestroy(&op_prolong);
+  CeedOperatorDestroy(&op_restrict);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t599-operator.c b/tests/t599-operator.c
new file mode 100644
index 0000000000..a38d6b1f47
--- /dev/null
+++ b/tests/t599-operator.c
@@ -0,0 +1,148 @@
+/// @file
+/// Test creation, action, and destruction for mass matrix operator at points using sequential composite operator
+/// \test Test creation, action, and destruction for mass matrix operator at points using sequential composite operator
+#include "t599-operator.h"
+
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  Ceed    ceed;
+  CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p = 3, q = 5;
+  CeedInt num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+  CeedVector          x_points, u, v, u_points;
+  CeedElemRestriction elem_restriction_x_points, elem_restriction_u_points, elem_restriction_u;
+  CeedBasis           basis_u;
+  CeedQFunction       qf_to_points, qf_from_points;
+  CeedOperator        op_to_points, op_from_points, op_mass;
+  bool                is_at_points, is_sequential;
+
+  CeedInit(argv[1], &ceed);
+
+  CeedVectorCreate(ceed, dim * num_points, &x_points);
+  {
+    CeedScalar x_array[dim * num_points];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      for (CeedInt d = 0; d < dim; d++) {
+        x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25;
+      }
+    }
+    CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+  }
+  {
+    CeedInt ind_x[num_elem + 1 + num_points];
+
+    for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem;
+    for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i;
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x,
+                                      &elem_restriction_x_points);
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_u_points);
+    CeedElemRestrictionCreateVector(elem_restriction_u_points, &u_points, NULL);
+    CeedVectorSetValue(u_points, 0);
+  }
+
+  {
+    CeedInt ind_u[num_elem * p * p];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0};
+
+      for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1;
+      {
+        CeedInt r_e = e;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          elem_xy[d] = r_e % num_elem_1d;
+          r_e /= num_elem_1d;
+        }
+      }
+      CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_u + e * num_nodes_in_elem;
+
+      for (CeedInt n = 0; n < num_nodes_in_elem; n++) {
+        CeedInt g_node = 0, g_node_stride = 1, r_node = n;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride;
+          g_node_stride *= n_d[d];
+          r_node /= p;
+        }
+        elem_nodes[n] = g_node;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, &elem_restriction_u);
+  }
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u);
+
+  CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_to_points);
+  CeedQFunctionAddInput(qf_to_points, "u", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddOutput(qf_to_points, "u_points", 1, CEED_EVAL_NONE);
+
+  CeedOperatorCreateAtPoints(ceed, qf_to_points, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_to_points);
+  CeedOperatorSetField(op_to_points, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_to_points, "u_points", elem_restriction_u_points, CEED_BASIS_NONE, u_points);
+  CeedOperatorAtPointsSetPoints(op_to_points, elem_restriction_x_points, x_points);
+
+  CeedOperatorIsAtPoints(op_to_points, &is_at_points);
+  if (!is_at_points) printf("Error: Operator should be at points\n");
+
+  CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_from_points);
+  CeedQFunctionAddInput(qf_from_points, "u_points", 1, CEED_EVAL_NONE);
+  CeedQFunctionAddOutput(qf_from_points, "v", 1, CEED_EVAL_INTERP);
+
+  CeedOperatorCreateAtPoints(ceed, qf_from_points, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_from_points);
+  CeedOperatorSetField(op_from_points, "u_points", elem_restriction_u_points, CEED_BASIS_NONE, u_points);
+  CeedOperatorSetField(op_from_points, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_from_points, elem_restriction_x_points, x_points);
+
+  CeedOperatorIsAtPoints(op_from_points, &is_at_points);
+  if (!is_at_points) printf("Error: Operator should be at points\n");
+
+  CeedOperatorCreateComposite(ceed, &op_mass);
+  CeedOperatorCompositeSetSequential(op_mass, true);
+  CeedOperatorCompositeAddSub(op_mass, op_to_points);
+  CeedOperatorCompositeAddSub(op_mass, op_from_points);
+
+  CeedVectorCreate(ceed, num_nodes, &u);
+  CeedVectorSetValue(u, 1.0);
+  CeedVectorCreate(ceed, num_nodes, &v);
+  CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE);
+
+  CeedOperatorCompositeIsSequential(op_mass, &is_sequential);
+  if (!is_sequential) printf("Error: Composite operator should be sequential\n");
+
+  {
+    CeedScalar        sum = 0.0;
+    const CeedScalar *v_array;
+
+    CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array);
+    for (CeedInt i = 0; i < num_nodes; i++) sum += v_array[i];
+    CeedVectorRestoreArrayRead(v, &v_array);
+    // Summing 9 reference elements, each 2x2 => 36 sq units area
+    if (fabs(sum - 4.0 * num_elem) > CEED_EPSILON * 5e3) {
+      // LCOV_EXCL_START
+      printf("Incorrect area computed, %g != %g (abs error %g)\n", sum, 4.0 * num_elem, fabs(sum - 4.0 * num_elem));
+      // LCOV_EXCL_STOP
+    }
+  }
+
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&u_points);
+  CeedVectorDestroy(&u);
+  CeedVectorDestroy(&v);
+  CeedElemRestrictionDestroy(&elem_restriction_x_points);
+  CeedElemRestrictionDestroy(&elem_restriction_u_points);
+  CeedElemRestrictionDestroy(&elem_restriction_u);
+  CeedBasisDestroy(&basis_u);
+  CeedQFunctionDestroy(&qf_to_points);
+  CeedQFunctionDestroy(&qf_from_points);
+  CeedOperatorDestroy(&op_to_points);
+  CeedOperatorDestroy(&op_from_points);
+  CeedOperatorDestroy(&op_mass);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t599-operator.h b/tests/t599-operator.h
new file mode 100644
index 0000000000..c50595bc26
--- /dev/null
+++ b/tests/t599-operator.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  const CeedScalar *u = in[0];
+  CeedScalar       *v = out[0];
+
+  for (CeedInt i = 0; i < Q; i++) v[i] = u[i];
+  return 0;
+}
diff --git a/tests/test-include/fake-sys-include.h b/tests/test-include/fake-sys-include.h
new file mode 100644
index 0000000000..edb954cb54
--- /dev/null
+++ b/tests/test-include/fake-sys-include.h
@@ -0,0 +1,14 @@
+#define FAKE_SYS_SCALE_ONE 1
+
+// Note - files included this way cannot transitively include any files CUDA/ROCm won't compile
+// These are bad and need to be guarded
+#ifndef CEED_RUNNING_JIT_PASS
+#include <math.h>
+#include <stddef.h>
+#endif
+
+// These are ok
+// Note - ceed/types.h should be used over ceed.h
+//        ceed.h is replaced with ceed/types.h during JiT
+#include <ceed.h>
+#include <ceed/types.h>